{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4c415cae-9ab3-4f34-83d2-2609f63af97a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import pytz\n",
    "from scapy.all import Ether, CookedLinux, Raw\n",
    "import re\n",
    "import logging\n",
    "import pyarrow\n",
    "import random\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e53a1d02-f035-44d8-a9ab-4ddc468393f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "edt = pytz.timezone('US/Eastern')\n",
    "def write_log(message):\n",
    "  current_time = str(datetime.now(edt).strftime('%Y-%m-%d %H:%M:%S'))\n",
    "  f = open(\"LOG_CICIDS.txt\", \"a\")\n",
    "  f.write(current_time + ' : ' + message + '\\n')\n",
    "  f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "568011f8-ceab-49ee-8d42-60e70157741f",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Create CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68edd80d-5058-4483-a671-869898c04af5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_1178853/2119090706.py:1: DtypeWarning: Columns (16,18) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv('CICIDS.csv')\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('CICIDS.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c16c6f6b-e425-4f62-8446-ef56aaaace19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>t_delta</th>\n",
       "      <th>stime_flow</th>\n",
       "      <th>duration</th>\n",
       "      <th>attack_cat</th>\n",
       "      <th>offset</th>\n",
       "      <th>srcip_flow</th>\n",
       "      <th>sport_flow</th>\n",
       "      <th>dstip_flow</th>\n",
       "      <th>dsport_flow</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1499082959</td>\n",
       "      <td>8.254.250.126</td>\n",
       "      <td>80</td>\n",
       "      <td>192.168.10.5</td>\n",
       "      <td>49188</td>\n",
       "      <td>tcp</td>\n",
       "      <td>55</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f360a8b00c1b114eb310800450000281b34000037...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499083e+09</td>\n",
       "      <td>4.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1499082959</td>\n",
       "      <td>8.254.250.126</td>\n",
       "      <td>80</td>\n",
       "      <td>192.168.10.5</td>\n",
       "      <td>49188</td>\n",
       "      <td>tcp</td>\n",
       "      <td>55</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f360a8b00c1b114eb310800450000281b34000037...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499083e+09</td>\n",
       "      <td>1.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1499082982</td>\n",
       "      <td>8.253.185.121</td>\n",
       "      <td>80</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>49486</td>\n",
       "      <td>tcp</td>\n",
       "      <td>55</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb310800450000284c22000037...</td>\n",
       "      <td>...</td>\n",
       "      <td>23.0</td>\n",
       "      <td>1.499083e+09</td>\n",
       "      <td>3.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1499082982</td>\n",
       "      <td>8.253.185.121</td>\n",
       "      <td>80</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>49486</td>\n",
       "      <td>tcp</td>\n",
       "      <td>55</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb310800450000284c22000037...</td>\n",
       "      <td>...</td>\n",
       "      <td>23.0</td>\n",
       "      <td>1.499083e+09</td>\n",
       "      <td>1.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1499082982</td>\n",
       "      <td>8.253.185.121</td>\n",
       "      <td>80</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>49486</td>\n",
       "      <td>tcp</td>\n",
       "      <td>55</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb310800450000284c22000037...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499083e+09</td>\n",
       "      <td>3.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287885</th>\n",
       "      <td>1499457761</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128</td>\n",
       "      <td>40</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee0800450000284980400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>113.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287886</th>\n",
       "      <td>1499457761</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64</td>\n",
       "      <td>60</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba808004500003cd6ba400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>191310.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287887</th>\n",
       "      <td>1499457761</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64</td>\n",
       "      <td>52</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba8080045000034d6bb400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>191310.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287888</th>\n",
       "      <td>1499457761</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>10399</td>\n",
       "      <td>184.84.243.218</td>\n",
       "      <td>80</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128</td>\n",
       "      <td>41</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000297740400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>20082634.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287889</th>\n",
       "      <td>1499457761</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581</td>\n",
       "      <td>tcp</td>\n",
       "      <td>49</td>\n",
       "      <td>52</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f360ba800c1b114eb31080045280034903b400031...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>191310.0</td>\n",
       "      <td>BENIGN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>91287890 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime          srcip  sport           dstip  dsport protocol_m  \\\n",
       "0         1499082959  8.254.250.126     80    192.168.10.5   49188        tcp   \n",
       "1         1499082959  8.254.250.126     80    192.168.10.5   49188        tcp   \n",
       "2         1499082982  8.253.185.121     80   192.168.10.14   49486        tcp   \n",
       "3         1499082982  8.253.185.121     80   192.168.10.14   49486        tcp   \n",
       "4         1499082982  8.253.185.121     80   192.168.10.14   49486        tcp   \n",
       "...              ...            ...    ...             ...     ...        ...   \n",
       "91287885  1499457761  192.168.10.14  59111   23.10.108.151     443        tcp   \n",
       "91287886  1499457761  192.168.10.51  59581   162.213.33.50     443        tcp   \n",
       "91287887  1499457761  192.168.10.51  59581   162.213.33.50     443        tcp   \n",
       "91287888  1499457761   192.168.10.8  10399  184.84.243.218      80        tcp   \n",
       "91287889  1499457761  162.213.33.50    443   192.168.10.51   59581        tcp   \n",
       "\n",
       "          sttl  total_len first_layer  \\\n",
       "0           55         40    Ethernet   \n",
       "1           55         40    Ethernet   \n",
       "2           55         40    Ethernet   \n",
       "3           55         40    Ethernet   \n",
       "4           55         40    Ethernet   \n",
       "...        ...        ...         ...   \n",
       "91287885   128         40    Ethernet   \n",
       "91287886    64         60    Ethernet   \n",
       "91287887    64         52    Ethernet   \n",
       "91287888   128         41    Ethernet   \n",
       "91287889    49         52    Ethernet   \n",
       "\n",
       "                                                     packet  ... t_delta  \\\n",
       "0         b8ac6f360a8b00c1b114eb310800450000281b34000037...  ...     0.0   \n",
       "1         b8ac6f360a8b00c1b114eb310800450000281b34000037...  ...     0.0   \n",
       "2         b8ac6f3607ee00c1b114eb310800450000284c22000037...  ...    23.0   \n",
       "3         b8ac6f3607ee00c1b114eb310800450000284c22000037...  ...    23.0   \n",
       "4         b8ac6f3607ee00c1b114eb310800450000284c22000037...  ...     0.0   \n",
       "...                                                     ...  ...     ...   \n",
       "91287885  00c1b114eb31b8ac6f3607ee0800450000284980400080...  ...     0.0   \n",
       "91287886  00c1b114eb31b8ac6f360ba808004500003cd6ba400040...  ...     0.0   \n",
       "91287887  00c1b114eb31b8ac6f360ba8080045000034d6bb400040...  ...     0.0   \n",
       "91287888  00c1b114eb31b8ac6f3608f50800450000297740400080...  ...     1.0   \n",
       "91287889  b8ac6f360ba800c1b114eb31080045280034903b400031...  ...     0.0   \n",
       "\n",
       "            stime_flow    duration  attack_cat offset     srcip_flow  \\\n",
       "0         1.499083e+09         4.0      BENIGN    1.0            NaN   \n",
       "1         1.499083e+09         1.0      BENIGN    1.0            NaN   \n",
       "2         1.499083e+09         3.0      BENIGN    1.0            NaN   \n",
       "3         1.499083e+09         1.0      BENIGN    1.0            NaN   \n",
       "4         1.499083e+09         3.0      BENIGN    1.0            NaN   \n",
       "...                ...         ...         ...    ...            ...   \n",
       "91287885  1.499458e+09       113.0      BENIGN   60.0            NaN   \n",
       "91287886  1.499458e+09    191310.0      BENIGN   60.0            NaN   \n",
       "91287887  1.499458e+09    191310.0      BENIGN   60.0            NaN   \n",
       "91287888  1.499458e+09  20082634.0      BENIGN   60.0            NaN   \n",
       "91287889  1.499458e+09    191310.0      BENIGN   60.0  192.168.10.51   \n",
       "\n",
       "         sport_flow     dstip_flow dsport_flow  label  \n",
       "0               NaN            NaN         NaN      0  \n",
       "1               NaN            NaN         NaN      0  \n",
       "2               NaN            NaN         NaN      0  \n",
       "3               NaN            NaN         NaN      0  \n",
       "4               NaN            NaN         NaN      0  \n",
       "...             ...            ...         ...    ...  \n",
       "91287885        NaN            NaN         NaN      0  \n",
       "91287886        NaN            NaN         NaN      0  \n",
       "91287887        NaN            NaN         NaN      0  \n",
       "91287888        NaN            NaN         NaN      0  \n",
       "91287889    59581.0  162.213.33.50       443.0      0  \n",
       "\n",
       "[91287890 rows x 21 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6c457b72-fc3f-479b-a4c1-adea8e24bb59",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3364: DtypeWarning: Columns (16,18) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  if (await self.run_code(code, result,  async_=asy)):\n"
     ]
    }
   ],
   "source": [
    "chunk = 1\n",
    "chunksize = 10_000_000\n",
    "\n",
    "for main_df in pd.read_csv('CICIDS.csv', chunksize=chunksize):\n",
    "    \n",
    "    write_log(f'<<<<<<<<----- Started Processing Chunk {chunk} ----->>>>>>>>')\n",
    "    \n",
    "    if os.path.isfile(f'./CICIDS/output{chunk}.csv'):\n",
    "        write_log(f'------------ Skipping DataFrame {chunk} as CSV File already exists ------------')\n",
    "        chunk += 1\n",
    "        continue\n",
    "\n",
    "    packet_info = []\n",
    "    packet_details = []\n",
    "    log_records = 100000\n",
    "\n",
    "    count = 0\n",
    "    for i in range(len(main_df)):\n",
    "        packet_type = main_df.iloc[i,8]\n",
    "        packet_bytes = bytes.fromhex(main_df.iloc[i,9])\n",
    "\n",
    "        if packet_type == 'cooked linux':\n",
    "            packet = CookedLinux(packet_bytes)\n",
    "        elif packet_type == 'Ethernet':\n",
    "            packet = Ether(packet_bytes)\n",
    "        else:\n",
    "            print('Error -> First Layer is not valid')\n",
    "        details = packet.show(dump=True)\n",
    "        packet_info.append(details)\n",
    "        count += 1\n",
    "        if count%log_records == 0:\n",
    "          write_log(f'Packets Parsed: {count}')\n",
    "\n",
    "    write_log(f'------------ All Packets Parsed Successfully for Chunk {chunk} ------------')\n",
    "\n",
    "    count = 0\n",
    "    for packet in packet_info:\n",
    "        fields_values = {}\n",
    "        current_layer = \"\"\n",
    "        for line in packet.split(\"\\n\"):\n",
    "            if line.startswith(\"###[\") and \"]\" in line:\n",
    "                current_layer = line.split(\"]\")[0].split(\"[\")[1].strip()\n",
    "                fields_values[current_layer] = {}\n",
    "            elif current_layer != \"\":\n",
    "                matches = re.findall(r\"\\s+([a-z_]+)\\s+=\\s+(.+)\", line)\n",
    "                for match in matches:\n",
    "                    field_name = match[0]\n",
    "                    field_value = match[1]\n",
    "                    fields_values[current_layer][field_name] = field_value\n",
    "        packet_details.append(fields_values)\n",
    "        count += 1\n",
    "        if count%log_records == 0:\n",
    "          write_log(f'Packet Fields Parsed: {count}')\n",
    "\n",
    "    write_log(f'------------ All Packet Fields Parsed Successfully for Chunk {chunk} ------------')\n",
    "\n",
    "    count = 0\n",
    "    df_list = []\n",
    "    for fields_values in packet_details:\n",
    "        row = {}\n",
    "        for layer, fields in fields_values.items():\n",
    "            for field in fields:\n",
    "                column_name = f\"{layer} {field}\"\n",
    "                row[column_name] = fields_values[layer][field]\n",
    "        df_list.append(row)\n",
    "        count += 1\n",
    "        if count%log_records == 0:\n",
    "          write_log(f'Packets appended to DataFrame: {count}')\n",
    "\n",
    "    df = pd.DataFrame(df_list)\n",
    "\n",
    "    write_log(f'------------ All Packets appended to the DataFrame {chunk} ------------')\n",
    "\n",
    "    df = pd.concat([main_df, df], axis=1)\n",
    "    df.to_csv(f'./CICIDS/output{chunk}.csv', index=False)\n",
    "\n",
    "    write_log(f'------------ DataFrame {chunk} saved to CSV File ------------')\n",
    "    \n",
    "    chunk += 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36e69de0-58dc-4ba9-ac26-8803122dc0ca",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Process CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b14d149f-caee-4cc0-ad33-87c1a73b9372",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_copy = df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9cfd581a-2901-4710-a1c1-4b8cfb5a7daa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>ARP ptype</th>\n",
       "      <th>ARP hwlen</th>\n",
       "      <th>ARP plen</th>\n",
       "      <th>ARP op</th>\n",
       "      <th>ARP hwsrc</th>\n",
       "      <th>ARP psrc</th>\n",
       "      <th>ARP hwdst</th>\n",
       "      <th>ARP pdst</th>\n",
       "      <th>Kerberos options</th>\n",
       "      <th>Kerberos address</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287885</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee0800450000284980400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287886</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba808004500003cd6ba400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287887</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba8080045000034d6bb400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287888</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>10399.0</td>\n",
       "      <td>184.84.243.218</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000297740400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91287889</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>49.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f360ba800c1b114eb31080045280034903b400031...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2575780 rows × 176 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 stime          srcip    sport           dstip   dsport  \\\n",
       "0                  NaN            NaN      NaN             NaN      NaN   \n",
       "1                  NaN            NaN      NaN             NaN      NaN   \n",
       "2                  NaN            NaN      NaN             NaN      NaN   \n",
       "3                  NaN            NaN      NaN             NaN      NaN   \n",
       "4                  NaN            NaN      NaN             NaN      NaN   \n",
       "...                ...            ...      ...             ...      ...   \n",
       "91287885  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "91287886  1.499458e+09  192.168.10.51  59581.0   162.213.33.50    443.0   \n",
       "91287887  1.499458e+09  192.168.10.51  59581.0   162.213.33.50    443.0   \n",
       "91287888  1.499458e+09   192.168.10.8  10399.0  184.84.243.218     80.0   \n",
       "91287889  1.499458e+09  162.213.33.50    443.0   192.168.10.51  59581.0   \n",
       "\n",
       "         protocol_m   sttl  total_len first_layer  \\\n",
       "0               NaN    NaN        NaN         NaN   \n",
       "1               NaN    NaN        NaN         NaN   \n",
       "2               NaN    NaN        NaN         NaN   \n",
       "3               NaN    NaN        NaN         NaN   \n",
       "4               NaN    NaN        NaN         NaN   \n",
       "...             ...    ...        ...         ...   \n",
       "91287885        tcp  128.0       40.0    Ethernet   \n",
       "91287886        tcp   64.0       60.0    Ethernet   \n",
       "91287887        tcp   64.0       52.0    Ethernet   \n",
       "91287888        tcp  128.0       41.0    Ethernet   \n",
       "91287889        tcp   49.0       52.0    Ethernet   \n",
       "\n",
       "                                                     packet  ... ARP ptype  \\\n",
       "0                                                       NaN  ...       NaN   \n",
       "1                                                       NaN  ...       NaN   \n",
       "2                                                       NaN  ...       NaN   \n",
       "3                                                       NaN  ...       NaN   \n",
       "4                                                       NaN  ...       NaN   \n",
       "...                                                     ...  ...       ...   \n",
       "91287885  00c1b114eb31b8ac6f3607ee0800450000284980400080...  ...       NaN   \n",
       "91287886  00c1b114eb31b8ac6f360ba808004500003cd6ba400040...  ...       NaN   \n",
       "91287887  00c1b114eb31b8ac6f360ba8080045000034d6bb400040...  ...       NaN   \n",
       "91287888  00c1b114eb31b8ac6f3608f50800450000297740400080...  ...       NaN   \n",
       "91287889  b8ac6f360ba800c1b114eb31080045280034903b400031...  ...       NaN   \n",
       "\n",
       "          ARP hwlen  ARP plen  ARP op ARP hwsrc  ARP psrc ARP hwdst  ARP pdst  \\\n",
       "0               NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1               NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "2               NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "3               NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "4               NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "...             ...       ...     ...       ...       ...       ...       ...   \n",
       "91287885        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "91287886        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "91287887        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "91287888        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "91287889        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "\n",
       "         Kerberos options  Kerberos address  \n",
       "0                     NaN               NaN  \n",
       "1                     NaN               NaN  \n",
       "2                     NaN               NaN  \n",
       "3                     NaN               NaN  \n",
       "4                     NaN               NaN  \n",
       "...                   ...               ...  \n",
       "91287885              NaN               NaN  \n",
       "91287886              NaN               NaN  \n",
       "91287887              NaN               NaN  \n",
       "91287888              NaN               NaN  \n",
       "91287889              NaN               NaN  \n",
       "\n",
       "[2575780 rows x 176 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview df_copy: a bare last expression renders the (truncated) DataFrame display.\n",
    "df_copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "bdbacf60-4a84-4bee-a8c1-6fa50cb5ffe2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (1,3,5,8,9,10,14,16,18,21,22,23,26,29,32,33,34,35,36,37,42,44,46,47,48,49,51,54,62,67,68,69,70,71,72,74,75,76,77,78,81,82,88,89,91,96,97,98,99,100,101,102,103,104,105,106,107,111,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,157,161,164,165,166,169,170,171,172,173,174,175) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Repair and de-duplicate the exported CSV files, overwriting each file in place.\n",
    "#\n",
    "# The realignment below pairs the leading N_LEADING_COLS columns taken from the\n",
    "# bottom half of the rows with the remaining columns taken from the top half.\n",
    "N_LEADING_COLS = 21\n",
    "\n",
    "for i in range(10, 11):\n",
    "    csv_path = f'./CICIDS/output{i}.csv'\n",
    "    if not os.path.isfile(csv_path):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')\n",
    "    # low_memory=False infers every column's dtype from the whole file, which avoids\n",
    "    # the mixed-type columns reported by DtypeWarning (at the cost of more memory).\n",
    "    df = pd.read_csv(csv_path, low_memory=False)\n",
    "    if df.shape[0]>10_000_000 or i==10:\n",
    "        rows = df.shape[0]//2\n",
    "        # The file is overwritten below, so this cell can run again on a file that\n",
    "        # was already repaired. Only realign when the two quadrants that would be\n",
    "        # thrown away are completely empty; otherwise real data would be lost.\n",
    "        discarded_is_empty = (\n",
    "            df.iloc[:rows, :N_LEADING_COLS].isna().all().all()\n",
    "            and df.iloc[rows:, N_LEADING_COLS:].isna().all().all()\n",
    "        )\n",
    "        if discarded_is_empty:\n",
    "            write_log(f'------------ Misaligned CSV File {i} Processing ------------')\n",
    "            df1 = df.iloc[rows:, :N_LEADING_COLS]\n",
    "            df2 = df.iloc[:rows, N_LEADING_COLS:]\n",
    "            df = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)\n",
    "        else:\n",
    "            write_log(f'------------ CSV File {i} Looks Aligned. Realignment Skipped ------------')\n",
    "    # Assignment instead of inplace=True keeps the step explicit and chain-friendly.\n",
    "    df = df.drop_duplicates()\n",
    "    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')\n",
    "    df.to_csv(csv_path, index=False)\n",
    "    write_log(f'------------ CSV File {i} Overwritten and Saved ------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c32aa879-41e8-45b6-b27d-5d39cd37a9e6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,47,48,49,51,52,54,59,60,61,62,63,67,75,80,81,82,83,84,85,86,87,90,91,93,94,95,96,97,98,99,100,101,102,103,104,105,106,108,109,110,111,112,113,114,115,116,122,123,124,125,126,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,154,157,158,159,161,162,163,167,174,175,176,179,180,181,182,183,186,190,195,196,197,198,199,200,203,204,206,207,208,209,210,211,212,213,214,215,216,217,219,220,221,222,225,226,227,228,230,232,234,235,237) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Load one exported file for inspection. low_memory=False infers each column's dtype\n",
    "# from the whole file, avoiding the mixed-type DtypeWarning (uses more memory).\n",
    "df = pd.read_csv('./CICIDS/output9.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "f0cf992e-9ffa-45ed-a831-29ac5e57eced",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>ARP ptype</th>\n",
       "      <th>ARP hwlen</th>\n",
       "      <th>ARP plen</th>\n",
       "      <th>ARP op</th>\n",
       "      <th>ARP hwsrc</th>\n",
       "      <th>ARP psrc</th>\n",
       "      <th>ARP hwdst</th>\n",
       "      <th>ARP pdst</th>\n",
       "      <th>Kerberos options</th>\n",
       "      <th>Kerberos address</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499455e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>60079.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2875.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f1080045000b3bdf12400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499455e+09</td>\n",
       "      <td>192.168.10.3</td>\n",
       "      <td>53.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>51981.0</td>\n",
       "      <td>udp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>117.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca281866da9be37d0800450000752396000080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499455e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>60079.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>8800.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f1080045002260df0c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499455e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>35678.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>8715.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f108004500220b14cf400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499455e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>35678.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f1080045000b9014cd400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287885</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee0800450000284980400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287886</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba808004500003cd6ba400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287887</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba8080045000034d6bb400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287888</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>10399.0</td>\n",
       "      <td>184.84.243.218</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000297740400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287889</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>49.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f360ba800c1b114eb31080045280034903b400031...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1287890 rows × 176 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                stime          srcip    sport           dstip   dsport  \\\n",
       "0        1.499455e+09  192.168.10.50     80.0      172.16.0.1  60079.0   \n",
       "1        1.499455e+09   192.168.10.3     53.0   192.168.10.15  51981.0   \n",
       "2        1.499455e+09  192.168.10.50     80.0      172.16.0.1  60079.0   \n",
       "3        1.499455e+09  192.168.10.50     80.0      172.16.0.1  35678.0   \n",
       "4        1.499455e+09  192.168.10.50     80.0      172.16.0.1  35678.0   \n",
       "...               ...            ...      ...             ...      ...   \n",
       "1287885  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "1287886  1.499458e+09  192.168.10.51  59581.0   162.213.33.50    443.0   \n",
       "1287887  1.499458e+09  192.168.10.51  59581.0   162.213.33.50    443.0   \n",
       "1287888  1.499458e+09   192.168.10.8  10399.0  184.84.243.218     80.0   \n",
       "1287889  1.499458e+09  162.213.33.50    443.0   192.168.10.51  59581.0   \n",
       "\n",
       "        protocol_m   sttl  total_len first_layer  \\\n",
       "0              tcp   64.0     2875.0    Ethernet   \n",
       "1              udp  128.0      117.0    Ethernet   \n",
       "2              tcp   64.0     8800.0    Ethernet   \n",
       "3              tcp   64.0     8715.0    Ethernet   \n",
       "4              tcp   64.0     2960.0    Ethernet   \n",
       "...            ...    ...        ...         ...   \n",
       "1287885        tcp  128.0       40.0    Ethernet   \n",
       "1287886        tcp   64.0       60.0    Ethernet   \n",
       "1287887        tcp   64.0       52.0    Ethernet   \n",
       "1287888        tcp  128.0       41.0    Ethernet   \n",
       "1287889        tcp   49.0       52.0    Ethernet   \n",
       "\n",
       "                                                    packet  ... ARP ptype  \\\n",
       "0        00c1b114eb310019b90a69f1080045000b3bdf12400040...  ...       NaN   \n",
       "1        001e4fd4ca281866da9be37d0800450000752396000080...  ...       NaN   \n",
       "2        00c1b114eb310019b90a69f1080045002260df0c400040...  ...       NaN   \n",
       "3        00c1b114eb310019b90a69f108004500220b14cf400040...  ...       NaN   \n",
       "4        00c1b114eb310019b90a69f1080045000b9014cd400040...  ...       NaN   \n",
       "...                                                    ...  ...       ...   \n",
       "1287885  00c1b114eb31b8ac6f3607ee0800450000284980400080...  ...       NaN   \n",
       "1287886  00c1b114eb31b8ac6f360ba808004500003cd6ba400040...  ...       NaN   \n",
       "1287887  00c1b114eb31b8ac6f360ba8080045000034d6bb400040...  ...       NaN   \n",
       "1287888  00c1b114eb31b8ac6f3608f50800450000297740400080...  ...       NaN   \n",
       "1287889  b8ac6f360ba800c1b114eb31080045280034903b400031...  ...       NaN   \n",
       "\n",
       "         ARP hwlen  ARP plen  ARP op ARP hwsrc  ARP psrc ARP hwdst  ARP pdst  \\\n",
       "0              NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1              NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "2              NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "3              NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "4              NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "...            ...       ...     ...       ...       ...       ...       ...   \n",
       "1287885        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1287886        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1287887        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1287888        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "1287889        NaN       NaN     NaN       NaN       NaN       NaN       NaN   \n",
       "\n",
       "        Kerberos options  Kerberos address  \n",
       "0                    NaN               NaN  \n",
       "1                    NaN               NaN  \n",
       "2                    NaN               NaN  \n",
       "3                    NaN               NaN  \n",
       "4                    NaN               NaN  \n",
       "...                  ...               ...  \n",
       "1287885              NaN               NaN  \n",
       "1287886              NaN               NaN  \n",
       "1287887              NaN               NaN  \n",
       "1287888              NaN               NaN  \n",
       "1287889              NaN               NaN  \n",
       "\n",
       "[1287890 rows x 176 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the current df (truncated DataFrame display).\n",
    "# NOTE(review): the stored output is In[19], i.e. it was produced right after the\n",
    "# realignment loop (In[18]), not after the output9 load above (In[24]) - confirm cell order.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "93d82d2c-f1de-4470-b4f4-f89ec9b25337",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "stime                                                 1499454694.0\n",
       "srcip                                                192.168.10.50\n",
       "sport                                                         80.0\n",
       "dstip                                                   172.16.0.1\n",
       "dsport                                                     60079.0\n",
       "protocol_m                                                     tcp\n",
       "sttl                                                          64.0\n",
       "total_len                                                   2875.0\n",
       "first_layer                                               Ethernet\n",
       "packet           00c1b114eb310019b90a69f1080045000b3bdf12400040...\n",
       "payload          6d616e2e6367693f71756572793d613264697373697465...\n",
       "t_delta                                                        0.0\n",
       "stime_flow                                            1499454660.0\n",
       "duration                                                  581642.0\n",
       "attack_cat                                                    DDoS\n",
       "offset                                                        60.0\n",
       "srcip_flow                                              172.16.0.1\n",
       "sport_flow                                                 60079.0\n",
       "dstip_flow                                           192.168.10.50\n",
       "dsport_flow                                                   80.0\n",
       "label                                                          1.0\n",
       "Ethernet dst                                     00:c1:b1:14:eb:31\n",
       "Ethernet src                                     00:19:b9:0a:69:f1\n",
       "Ethernet type                                                 IPv4\n",
       "IP version                                                     4.0\n",
       "IP ihl                                                         5.0\n",
       "IP tos                                                         0x0\n",
       "IP len                                                      2875.0\n",
       "IP id                                                      57106.0\n",
       "IP flags                                                        DF\n",
       "IP frag                                                        0.0\n",
       "IP ttl                                                        64.0\n",
       "IP proto                                                       tcp\n",
       "IP chksum                                                   0xd9be\n",
       "IP src                                               192.168.10.50\n",
       "IP dst                                                  172.16.0.1\n",
       "TCP sport                                                     http\n",
       "TCP dport                                                    60079\n",
       "TCP seq                                               3624104692.0\n",
       "TCP ack                                                747123647.0\n",
       "TCP dataofs                                                    5.0\n",
       "TCP reserved                                                   0.0\n",
       "TCP flags                                                       PA\n",
       "TCP window                                                   229.0\n",
       "TCP chksum                                                  0x8219\n",
       "TCP urgptr                                                     0.0\n",
       "TCP options                                                     []\n",
       "Raw load         'man.cgi?query=a2dissite\">a2dissite</a>,\\n    ...\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show only the populated (non-NaN) fields of the first row.\n",
    "df.iloc[0].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "9cd1290b-474d-46a6-ac11-1b51e68e85d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TCP flags</th>\n",
       "      <th>TCP window</th>\n",
       "      <th>TCP chksum</th>\n",
       "      <th>TCP urgptr</th>\n",
       "      <th>TCP options</th>\n",
       "      <th>Raw load</th>\n",
       "      <th>UDP sport</th>\n",
       "      <th>UDP dport</th>\n",
       "      <th>UDP len</th>\n",
       "      <th>UDP chksum</th>\n",
       "      <th>...</th>\n",
       "      <th>ARP ptype</th>\n",
       "      <th>ARP hwlen</th>\n",
       "      <th>ARP plen</th>\n",
       "      <th>ARP op</th>\n",
       "      <th>ARP hwsrc</th>\n",
       "      <th>ARP psrc</th>\n",
       "      <th>ARP hwdst</th>\n",
       "      <th>ARP pdst</th>\n",
       "      <th>Kerberos options</th>\n",
       "      <th>Kerberos address</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>PA</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0x8219</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'man.cgi?query=a2dissite\"&gt;a2dissite&lt;/a&gt;,\\n    ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>domain</td>\n",
       "      <td>51981</td>\n",
       "      <td>97.0</td>\n",
       "      <td>0x2c5f</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0x993e</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'HTTP/1.1 200 OK\\r\\nDate: Fri, 07 Jul 2017 19:...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>PA</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0x98e9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'ng_element {\\n    position: relative;\\n    fl...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0x826e</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'HTTP/1.1 200 OK\\r\\nDate: Fri, 07 Jul 2017 19:...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287885</th>\n",
       "      <td>FA</td>\n",
       "      <td>256.0</td>\n",
       "      <td>0x7d89</td>\n",
       "      <td>0.0</td>\n",
       "      <td>''</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287886</th>\n",
       "      <td>S</td>\n",
       "      <td>29200.0</td>\n",
       "      <td>0xd09d</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('MSS', 1460), ('SAckOK', b''), ('Timestamp',...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287887</th>\n",
       "      <td>A</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0x3b49</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287888</th>\n",
       "      <td>A</td>\n",
       "      <td>16329.0</td>\n",
       "      <td>0x4de2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[]</td>\n",
       "      <td>'\\x00'</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287889</th>\n",
       "      <td>A</td>\n",
       "      <td>235.0</td>\n",
       "      <td>0x3a96</td>\n",
       "      <td>0.0</td>\n",
       "      <td>[('NOP', None), ('NOP', None), ('Timestamp', (...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1287890 rows × 134 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        TCP flags  TCP window TCP chksum  TCP urgptr  \\\n",
       "0              PA       229.0     0x8219         0.0   \n",
       "1             NaN         NaN        NaN         NaN   \n",
       "2               A       229.0     0x993e         0.0   \n",
       "3              PA       229.0     0x98e9         0.0   \n",
       "4               A       229.0     0x826e         0.0   \n",
       "...           ...         ...        ...         ...   \n",
       "1287885        FA       256.0     0x7d89         0.0   \n",
       "1287886         S     29200.0     0xd09d         0.0   \n",
       "1287887         A       229.0     0x3b49         0.0   \n",
       "1287888         A     16329.0     0x4de2         0.0   \n",
       "1287889         A       235.0     0x3a96         0.0   \n",
       "\n",
       "                                               TCP options  \\\n",
       "0                                                       []   \n",
       "1                                                      NaN   \n",
       "2                                                       []   \n",
       "3                                                       []   \n",
       "4                                                       []   \n",
       "...                                                    ...   \n",
       "1287885                                                 ''   \n",
       "1287886  [('MSS', 1460), ('SAckOK', b''), ('Timestamp',...   \n",
       "1287887  [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "1287888                                                 []   \n",
       "1287889  [('NOP', None), ('NOP', None), ('Timestamp', (...   \n",
       "\n",
       "                                                  Raw load UDP sport  \\\n",
       "0        'man.cgi?query=a2dissite\">a2dissite</a>,\\n    ...       NaN   \n",
       "1                                                      NaN    domain   \n",
       "2        'HTTP/1.1 200 OK\\r\\nDate: Fri, 07 Jul 2017 19:...       NaN   \n",
       "3        'ng_element {\\n    position: relative;\\n    fl...       NaN   \n",
       "4        'HTTP/1.1 200 OK\\r\\nDate: Fri, 07 Jul 2017 19:...       NaN   \n",
       "...                                                    ...       ...   \n",
       "1287885                                                NaN       NaN   \n",
       "1287886                                                NaN       NaN   \n",
       "1287887                                                NaN       NaN   \n",
       "1287888                                             '\\x00'       NaN   \n",
       "1287889                                                NaN       NaN   \n",
       "\n",
       "        UDP dport  UDP len UDP chksum  ...  ARP ptype  ARP hwlen ARP plen  \\\n",
       "0             NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "1           51981     97.0     0x2c5f  ...        NaN        NaN      NaN   \n",
       "2             NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "3             NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "4             NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "...           ...      ...        ...  ...        ...        ...      ...   \n",
       "1287885       NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "1287886       NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "1287887       NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "1287888       NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "1287889       NaN      NaN        NaN  ...        NaN        NaN      NaN   \n",
       "\n",
       "         ARP op  ARP hwsrc  ARP psrc  ARP hwdst  ARP pdst  Kerberos options  \\\n",
       "0           NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "1           NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "2           NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "3           NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "4           NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "...         ...        ...       ...        ...       ...               ...   \n",
       "1287885     NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "1287886     NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "1287887     NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "1287888     NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "1287889     NaN        NaN       NaN        NaN       NaN               NaN   \n",
       "\n",
       "         Kerberos address  \n",
       "0                     NaN  \n",
       "1                     NaN  \n",
       "2                     NaN  \n",
       "3                     NaN  \n",
       "4                     NaN  \n",
       "...                   ...  \n",
       "1287885               NaN  \n",
       "1287886               NaN  \n",
       "1287887               NaN  \n",
       "1287888               NaN  \n",
       "1287889               NaN  \n",
       "\n",
       "[1287890 rows x 134 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "3051d8e7-fe64-463f-8bef-906ac656d101",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ethernet dst</th>\n",
       "      <th>Ethernet src</th>\n",
       "      <th>Ethernet type</th>\n",
       "      <th>IP version</th>\n",
       "      <th>IP ihl</th>\n",
       "      <th>IP tos</th>\n",
       "      <th>IP len</th>\n",
       "      <th>IP id</th>\n",
       "      <th>IP flags</th>\n",
       "      <th>IP frag</th>\n",
       "      <th>...</th>\n",
       "      <th>IP proto</th>\n",
       "      <th>IP chksum</th>\n",
       "      <th>IP src</th>\n",
       "      <th>IP dst</th>\n",
       "      <th>TCP sport</th>\n",
       "      <th>TCP dport</th>\n",
       "      <th>TCP seq</th>\n",
       "      <th>TCP ack</th>\n",
       "      <th>TCP dataofs</th>\n",
       "      <th>TCP reserved</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1287890</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287891</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287892</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287893</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287894</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2575775</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2575776</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2575777</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2575778</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2575779</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1287890 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Ethernet dst Ethernet src Ethernet type  IP version  IP ihl IP tos  \\\n",
       "1287890          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287891          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287892          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287893          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287894          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "...              ...          ...           ...         ...     ...    ...   \n",
       "2575775          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "2575776          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "2575777          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "2575778          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "2575779          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "\n",
       "         IP len  IP id IP flags  IP frag  ...  IP proto IP chksum IP src  \\\n",
       "1287890     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "1287891     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "1287892     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "1287893     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "1287894     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "...         ...    ...      ...      ...  ...       ...       ...    ...   \n",
       "2575775     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "2575776     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "2575777     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "2575778     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "2575779     NaN    NaN      NaN      NaN  ...       NaN       NaN    NaN   \n",
       "\n",
       "        IP dst TCP sport TCP dport TCP seq  TCP ack  TCP dataofs  TCP reserved  \n",
       "1287890    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "1287891    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "1287892    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "1287893    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "1287894    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "...        ...       ...       ...     ...      ...          ...           ...  \n",
       "2575775    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "2575776    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "2575777    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "2575778    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "2575779    NaN       NaN       NaN     NaN      NaN          NaN           NaN  \n",
       "\n",
       "[1287890 rows x 21 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "aa6ca421-59d9-459a-bf78-d764911247ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1287890"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.shape[0]//2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "93209fd3-7b92-4a16-880f-8c7fd119fb24",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ethernet dst</th>\n",
       "      <th>Ethernet src</th>\n",
       "      <th>Ethernet type</th>\n",
       "      <th>IP version</th>\n",
       "      <th>IP ihl</th>\n",
       "      <th>IP tos</th>\n",
       "      <th>IP len</th>\n",
       "      <th>IP id</th>\n",
       "      <th>IP flags</th>\n",
       "      <th>IP frag</th>\n",
       "      <th>...</th>\n",
       "      <th>ARP ptype</th>\n",
       "      <th>ARP hwlen</th>\n",
       "      <th>ARP plen</th>\n",
       "      <th>ARP op</th>\n",
       "      <th>ARP hwsrc</th>\n",
       "      <th>ARP psrc</th>\n",
       "      <th>ARP hwdst</th>\n",
       "      <th>ARP pdst</th>\n",
       "      <th>Kerberos options</th>\n",
       "      <th>Kerberos address</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287876</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287884</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287886</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287887</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1287889</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>523805 rows × 155 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Ethernet dst Ethernet src Ethernet type  IP version  IP ihl IP tos  \\\n",
       "0                NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1                NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "2                NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "3                NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "4                NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "...              ...          ...           ...         ...     ...    ...   \n",
       "1287876          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287884          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287886          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287887          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "1287889          NaN          NaN           NaN         NaN     NaN    NaN   \n",
       "\n",
       "         IP len  IP id IP flags  IP frag  ...  ARP ptype ARP hwlen ARP plen  \\\n",
       "0           NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "1           NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "2           NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "3           NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "4           NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "...         ...    ...      ...      ...  ...        ...       ...      ...   \n",
       "1287876     NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "1287884     NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "1287886     NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "1287887     NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "1287889     NaN    NaN      NaN      NaN  ...        NaN       NaN      NaN   \n",
       "\n",
       "        ARP op ARP hwsrc ARP psrc ARP hwdst  ARP pdst  Kerberos options  \\\n",
       "0          NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "1          NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "2          NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "3          NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "4          NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "...        ...       ...      ...       ...       ...               ...   \n",
       "1287876    NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "1287884    NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "1287886    NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "1287887    NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "1287889    NaN       NaN      NaN       NaN       NaN               NaN   \n",
       "\n",
       "         Kerberos address  \n",
       "0                     NaN  \n",
       "1                     NaN  \n",
       "2                     NaN  \n",
       "3                     NaN  \n",
       "4                     NaN  \n",
       "...                   ...  \n",
       "1287876               NaN  \n",
       "1287884               NaN  \n",
       "1287886               NaN  \n",
       "1287887               NaN  \n",
       "1287889               NaN  \n",
       "\n",
       "[523805 rows x 155 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "545c214f-fbdd-43ff-906a-f81176b857a8",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Sample Records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d4d25807-5939-4a85-b366-5dc425e0544a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,47,48,50,53,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,72,80,85,86,87,88,89,90,92,96,97,98,99,100,101,102,103,104,105,106,107,112,116,120,121,122,123,124,125,126,127,128,132,133,135,138,142,147,148,149,150,151,152,153,154,155,157,158,159,160,161,162,171,173,178,179,180,181,182,183,184,185,187,188,190,191,192,193,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,226,231,232,233,234,235,237,238,239,240,243) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,37,47,49,50,52,55,63,68,69,70,71,72,73,75,76,77,78,79,82,83,84,85,86,87,88,90,95,96,97,98,99,100,101,102,108,112,113,114,115,116,117,118,119,120,121,122,123,124,125,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,146,150,153,154,155,156,157,160,161,162,164,165,166,169,170,171,172,173,174,177,181,186,187,188,189,190,191,193,194,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,213,221) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (10,16,18,37,47,49,50,51,53,54,56,61,62,63,64,65,68,76,81,82,83,84,85,86,88,89,90,91,94,95,96,97,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,129,130,131,132,133,137,138,139,140,141,142,143,144,145,146,147,148,149,154,156,159,160,161,168,172,177,178,179,180,181,182,183,184,187,188,189,190,191,193,194) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,37,49,50,52,55,63,68,69,70,71,72,73,75,76,77,80,81,82,83,91,95,100,101,102,103,104,105,106,108,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,135,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,165,167,171,177,178,179,180,182,183,186,187,188,189,190,192,193,194,195,196,197,198,199,200,201,202,203,204,205,207,208,210,211) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,37,47,49,50,52,53,55,60,61,62,63,64,65,68,76,81,82,83,84,85,86,88,89,90,93,94,95,96,102,103,104,105,106,110,111,112,113,114,115,116,117,118,119,120,121,122,123,132,135,136,137,139,140,141,142,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,162,163,164,165,166,167,168,169,170,171,172,173,174,177,181,186,187,188,189,190,191) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,37,48,49,50,52,55,63,68,69,70,71,72,73,74,75,76,79,81,82,84,89,90,91,92,93,94,95,96,102,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,130,133,134,135,137,138,141,145,150,151,152,153,154,155,156,157,160,161,162,163,164,166,167,168,169,170,171,172,173,174,175,176,177,178,179,183,184,185,186,187,188,189,190,191,193,194,195,196,197,198,199,201,202,204,205,206) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,48,49,51,52,54,59,60,61,62,63,64,65,68,76,81,82,83,84,85,86,87,88,89,92,94,95,96,102,103,104,105,106,107,109,110,111,112,113,114,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,135,136,139,140,141,142,143,144,145,146,147,148,149,158,160,163,164,165,166,167,168,169,170,171,172,173,176,180,185,186,187,188,189,190,191,197,201,203,204,205,206,207,208,211,212,214) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,37,48,49,50,52,53,55,60,61,62,63,64,65,68,76,81,82,83,84,85,86,88,89,90,91,94,95,96,97,103,104,107,108,109,110,111,112,113,114,115,116,117,118,119,120,122,123,125,128,132,137,138,139,140,141,142,143,144,145,146,147,148,149,150,160,163,164,165,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,186,187,189,190,191,192,193,194,195,196,197,198,199,202,203,204,205,206,207,208,210,212,215,221,222) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (16,18,36,47,48,49,51,52,54,59,60,61,62,63,67,75,80,81,82,83,84,85,86,87,90,91,93,94,95,96,97,98,99,100,101,102,103,104,105,106,108,109,110,111,112,113,114,115,116,122,123,124,125,126,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,154,157,158,159,161,162,163,167,174,175,176,179,180,181,182,183,186,190,195,196,197,198,199,200,203,204,206,207,208,209,210,211,212,213,214,215,216,217,219,220,221,222,225,226,227,228,230,232,234,235,237) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "/opt/jupyter-hub/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (48,49,51,54,62,67,68,69,70,71,72,74,75,76,77,81,82,88,89,91,96,97,98,99,100,101,102,103,104,105,106,107,111,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,157,161,164,165,166,169,170,171,172,173,174,175) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Reduce each raw per-capture CSV to TCP/UDP packets that carry a payload and\n",
    "# save the result under ./CICIDS/CICIDS-1/ with the same file name.\n",
    "# DataFrame.to_csv does not create missing folders, so make sure it exists.\n",
    "os.makedirs('./CICIDS/CICIDS-1', exist_ok=True)\n",
    "for i in range(0,15):\n",
    "    # Indices without a matching input file are skipped silently.\n",
    "    if not os.path.isfile(f'./CICIDS/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')\n",
    "    # low_memory=False infers each column's dtype from the whole file instead of\n",
    "    # chunk by chunk, which removes the mixed-type DtypeWarning.\n",
    "    df = pd.read_csv(f'./CICIDS/output{i}.csv', low_memory=False)\n",
    "    # Keep only TCP/UDP rows (rows with a missing protocol never match).\n",
    "    df = df[df['protocol_m'].isin(['tcp', 'udp'])]\n",
    "    # Drop packets without a payload, then columns left completely empty.\n",
    "    df = df.dropna(subset=['payload'])\n",
    "    df = df.dropna(axis=1, how='all')\n",
    "    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')\n",
    "    df.to_csv(f'./CICIDS/CICIDS-1/output{i}.csv', index=False)\n",
    "    write_log(f'------------ CSV File {i} Saved ------------')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12cc59f8-b3d7-4197-b5d3-62ef35204724",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Concatenate Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bb42b25-f22b-4adc-a121-e88b5de57aee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the filtered per-capture files from ./CICIDS/CICIDS-1/ into fewer,\n",
    "# larger files ./CICIDS/CICIDS-2/output{k}.csv, with k counting up from 1.\n",
    "# A group is closed after file 0, 3, 6 and 10, and also after the last file\n",
    "# that exists, so nothing read after the final boundary is silently dropped.\n",
    "os.makedirs('./CICIDS/CICIDS-2', exist_ok=True)\n",
    "k = 1\n",
    "existing = [i for i in range(0,12) if os.path.isfile(f'./CICIDS/CICIDS-1/output{i}.csv')]\n",
    "frames = []  # files read since the previous group was written\n",
    "for i in existing:\n",
    "    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')\n",
    "    # low_memory=False infers each column's dtype from the whole file, avoiding\n",
    "    # the mixed-type DtypeWarning raised by chunked inference.\n",
    "    frames.append(pd.read_csv(f'./CICIDS/CICIDS-1/output{i}.csv', low_memory=False))\n",
    "    write_log(f'------------ CSV File {i} added to DataFrame ------------')\n",
    "    if (i%3 == 0 and i!=9) or i==10 or i==existing[-1]:\n",
    "        # Concatenate once per group rather than re-copying a growing frame\n",
    "        # on every iteration.\n",
    "        df = pd.concat(frames, ignore_index=True)\n",
    "        df.to_csv(f'./CICIDS/CICIDS-2/output{k}.csv', index=False)\n",
    "        print(df['attack_cat'].value_counts())\n",
    "        # Release the merged group before the next one is read.\n",
    "        frames = []\n",
    "        df = pd.DataFrame()\n",
    "        k = k+1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "860e25ef-5aa3-4dbd-8dc1-29b284aabf14",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Sampling DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ba40c98c-51f2-4d36-8a2c-eb5dafe33dcb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3856262/1395569476.py:1: DtypeWarning: Columns (16,18,36,37,48,49,51,52,54,59,60,61,62,63,64,65,68,76,81,82,83,84,85,86,87,88,89,92,94,95,96,102,103,104,105,106,107,109,110,111,112,113,114,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,135,136,137,138,139,140,149,151,154,155,156,157,158,159,160,161,162,163,164,167,171,176,177,178,179,180,181,182,188,191,192,193,197,198,199,201,203,206,207,208,209,210,211,212,213,214,215) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv('./CICIDS/CICIDS-2/output3.csv')\n"
     ]
    }
   ],
   "source": [
    "# Load the third merged file produced by the concatenation step.\n",
    "# low_memory=False infers each column's dtype from the whole file, which\n",
    "# removes the mixed-type DtypeWarning raised by the default chunked parsing.\n",
    "df = pd.read_csv('./CICIDS/CICIDS-2/output3.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "4242f8f4-cb38-4588-989f-141f227d64b7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>PPTP Start Control Connection Request host_name</th>\n",
       "      <th>Session Setup AndX Extended Security Response (SMB) pvno</th>\n",
       "      <th>Session Setup AndX Extended Security Response (SMB) etype</th>\n",
       "      <th>Session Setup AndX Extended Security Response (SMB) kvno</th>\n",
       "      <th>Session Setup AndX Extended Security Response (SMB) cipher</th>\n",
       "      <th>Session Setup AndX Extended Security Request (SMB) pvno</th>\n",
       "      <th>Session Setup AndX Extended Security Request (SMB) realm</th>\n",
       "      <th>Session Setup AndX Extended Security Request (SMB) etype</th>\n",
       "      <th>Session Setup AndX Extended Security Request (SMB) kvno</th>\n",
       "      <th>Session Setup AndX Extended Security Request (SMB) cipher</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d01400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b903d0f400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d01400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d02400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b903d0d400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787313</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee080045000028497f400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787314</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee080045000028497f400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787315</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee0800450000284980400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787316</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3607ee0800450000284980400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787317</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>10399.0</td>\n",
       "      <td>184.84.243.218</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000297740400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>25787318 rows × 216 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 stime          srcip    sport           dstip   dsport  \\\n",
       "0         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "1         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "2         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "3         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "4         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "...                ...            ...      ...             ...      ...   \n",
       "25787313  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "25787314  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "25787315  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "25787316  1.499458e+09  192.168.10.14  59111.0   23.10.108.151    443.0   \n",
       "25787317  1.499458e+09   192.168.10.8  10399.0  184.84.243.218     80.0   \n",
       "\n",
       "         protocol_m   sttl  total_len first_layer  \\\n",
       "0               tcp  118.0     1500.0    Ethernet   \n",
       "1               tcp  118.0     2960.0    Ethernet   \n",
       "2               tcp  118.0     1500.0    Ethernet   \n",
       "3               tcp  118.0     1500.0    Ethernet   \n",
       "4               tcp  118.0     2960.0    Ethernet   \n",
       "...             ...    ...        ...         ...   \n",
       "25787313        tcp  128.0       40.0    Ethernet   \n",
       "25787314        tcp  128.0       40.0    Ethernet   \n",
       "25787315        tcp  128.0       40.0    Ethernet   \n",
       "25787316        tcp  128.0       40.0    Ethernet   \n",
       "25787317        tcp  128.0       41.0    Ethernet   \n",
       "\n",
       "                                                     packet  ...  \\\n",
       "0         001e4fd4ca2800c1b114eb310800450005dc3d01400076...  ...   \n",
       "1         001e4fd4ca2800c1b114eb31080045000b903d0f400076...  ...   \n",
       "2         001e4fd4ca2800c1b114eb310800450005dc3d01400076...  ...   \n",
       "3         001e4fd4ca2800c1b114eb310800450005dc3d02400076...  ...   \n",
       "4         001e4fd4ca2800c1b114eb31080045000b903d0d400076...  ...   \n",
       "...                                                     ...  ...   \n",
       "25787313  00c1b114eb31b8ac6f3607ee080045000028497f400080...  ...   \n",
       "25787314  00c1b114eb31b8ac6f3607ee080045000028497f400080...  ...   \n",
       "25787315  00c1b114eb31b8ac6f3607ee0800450000284980400080...  ...   \n",
       "25787316  00c1b114eb31b8ac6f3607ee0800450000284980400080...  ...   \n",
       "25787317  00c1b114eb31b8ac6f3608f50800450000297740400080...  ...   \n",
       "\n",
       "         PPTP Start Control Connection Request host_name  \\\n",
       "0                                                    NaN   \n",
       "1                                                    NaN   \n",
       "2                                                    NaN   \n",
       "3                                                    NaN   \n",
       "4                                                    NaN   \n",
       "...                                                  ...   \n",
       "25787313                                             NaN   \n",
       "25787314                                             NaN   \n",
       "25787315                                             NaN   \n",
       "25787316                                             NaN   \n",
       "25787317                                             NaN   \n",
       "\n",
       "          Session Setup AndX Extended Security Response (SMB) pvno  \\\n",
       "0                                                       NaN          \n",
       "1                                                       NaN          \n",
       "2                                                       NaN          \n",
       "3                                                       NaN          \n",
       "4                                                       NaN          \n",
       "...                                                     ...          \n",
       "25787313                                                NaN          \n",
       "25787314                                                NaN          \n",
       "25787315                                                NaN          \n",
       "25787316                                                NaN          \n",
       "25787317                                                NaN          \n",
       "\n",
       "          Session Setup AndX Extended Security Response (SMB) etype  \\\n",
       "0                                                       NaN           \n",
       "1                                                       NaN           \n",
       "2                                                       NaN           \n",
       "3                                                       NaN           \n",
       "4                                                       NaN           \n",
       "...                                                     ...           \n",
       "25787313                                                NaN           \n",
       "25787314                                                NaN           \n",
       "25787315                                                NaN           \n",
       "25787316                                                NaN           \n",
       "25787317                                                NaN           \n",
       "\n",
       "          Session Setup AndX Extended Security Response (SMB) kvno  \\\n",
       "0                                                       NaN          \n",
       "1                                                       NaN          \n",
       "2                                                       NaN          \n",
       "3                                                       NaN          \n",
       "4                                                       NaN          \n",
       "...                                                     ...          \n",
       "25787313                                                NaN          \n",
       "25787314                                                NaN          \n",
       "25787315                                                NaN          \n",
       "25787316                                                NaN          \n",
       "25787317                                                NaN          \n",
       "\n",
       "         Session Setup AndX Extended Security Response (SMB) cipher  \\\n",
       "0                                                       NaN           \n",
       "1                                                       NaN           \n",
       "2                                                       NaN           \n",
       "3                                                       NaN           \n",
       "4                                                       NaN           \n",
       "...                                                     ...           \n",
       "25787313                                                NaN           \n",
       "25787314                                                NaN           \n",
       "25787315                                                NaN           \n",
       "25787316                                                NaN           \n",
       "25787317                                                NaN           \n",
       "\n",
       "          Session Setup AndX Extended Security Request (SMB) pvno  \\\n",
       "0                                                       NaN         \n",
       "1                                                       NaN         \n",
       "2                                                       NaN         \n",
       "3                                                       NaN         \n",
       "4                                                       NaN         \n",
       "...                                                     ...         \n",
       "25787313                                                NaN         \n",
       "25787314                                                NaN         \n",
       "25787315                                                NaN         \n",
       "25787316                                                NaN         \n",
       "25787317                                                NaN         \n",
       "\n",
       "         Session Setup AndX Extended Security Request (SMB) realm  \\\n",
       "0                                                       NaN         \n",
       "1                                                       NaN         \n",
       "2                                                       NaN         \n",
       "3                                                       NaN         \n",
       "4                                                       NaN         \n",
       "...                                                     ...         \n",
       "25787313                                                NaN         \n",
       "25787314                                                NaN         \n",
       "25787315                                                NaN         \n",
       "25787316                                                NaN         \n",
       "25787317                                                NaN         \n",
       "\n",
       "          Session Setup AndX Extended Security Request (SMB) etype  \\\n",
       "0                                                       NaN          \n",
       "1                                                       NaN          \n",
       "2                                                       NaN          \n",
       "3                                                       NaN          \n",
       "4                                                       NaN          \n",
       "...                                                     ...          \n",
       "25787313                                                NaN          \n",
       "25787314                                                NaN          \n",
       "25787315                                                NaN          \n",
       "25787316                                                NaN          \n",
       "25787317                                                NaN          \n",
       "\n",
       "         Session Setup AndX Extended Security Request (SMB) kvno  \\\n",
       "0                                                       NaN        \n",
       "1                                                       NaN        \n",
       "2                                                       NaN        \n",
       "3                                                       NaN        \n",
       "4                                                       NaN        \n",
       "...                                                     ...        \n",
       "25787313                                                NaN        \n",
       "25787314                                                NaN        \n",
       "25787315                                                NaN        \n",
       "25787316                                                NaN        \n",
       "25787317                                                NaN        \n",
       "\n",
       "          Session Setup AndX Extended Security Request (SMB) cipher  \n",
       "0                                                       NaN          \n",
       "1                                                       NaN          \n",
       "2                                                       NaN          \n",
       "3                                                       NaN          \n",
       "4                                                       NaN          \n",
       "...                                                     ...          \n",
       "25787313                                                NaN          \n",
       "25787314                                                NaN          \n",
       "25787315                                                NaN          \n",
       "25787316                                                NaN          \n",
       "25787317                                                NaN          \n",
       "\n",
       "[25787318 rows x 216 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "045d0754-cd40-482a-acba-fb6ef056071b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BENIGN              11612298\n",
       "DoS Hulk             2286837\n",
       "SSH-Patator           138639\n",
       "Heartbleed             41285\n",
       "DoS GoldenEye          36151\n",
       "DoS slowloris          25139\n",
       "DoS Slowhttptest       11641\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count rows per attack_cat label in the frame currently bound to df.\n",
    "# NOTE(review): 'Part 2' presumably identifies which portion of the data df held when this ran - confirm.\n",
    "df['attack_cat'].value_counts() # Part 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "49248027-cb26-4e1d-9ab2-00deaf957983",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BENIGN         15231102\n",
       "FTP-Patator      110636\n",
       "SSH-Patator       42951\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count rows per attack_cat label in the frame currently bound to df.\n",
    "# NOTE(review): 'Part 1' presumably identifies which portion of the data df held when this ran - confirm.\n",
    "df['attack_cat'].value_counts() # Part 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f0efbb1f-d5f8-4b8c-8297-cf2095661974",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df['protocol_m']=='tcp']\n",
    "columns_to_remove = df.columns[df.notna().sum() < 1000]\n",
    "rows_to_remove = df[df[columns_to_remove].notna().any(axis=1)].index\n",
    "df = df.drop(columns_to_remove, axis=1)\n",
    "df = df.drop(rows_to_remove)\n",
    "df = df.dropna(subset='Raw load')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "909ddd20-ca1b-49e5-97fb-6d629805d7c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 rpc_vers</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>SMB2 Negotiate Protocol Response load</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d01400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b903d0f400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d01400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3d02400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b903d0d400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787305</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>37759.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>900.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b95670019b90a69f108004500038442a9400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787310</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>54.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb31080045000047aaf3400036...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787311</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>23.10.108.151</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>59111.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>54.0</td>\n",
       "      <td>71.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb31080045000047aaf3400036...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787312</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>59581.0</td>\n",
       "      <td>162.213.33.50</td>\n",
       "      <td>443.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>200.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450000c8d6bc400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25787317</th>\n",
       "      <td>1.499458e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>10399.0</td>\n",
       "      <td>184.84.243.218</td>\n",
       "      <td>80.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>41.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000297740400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14952904 rows × 78 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 stime          srcip    sport           dstip   dsport  \\\n",
       "0         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "1         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "2         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "3         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "4         1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "...                ...            ...      ...             ...      ...   \n",
       "25787305  1.499458e+09  192.168.10.50     22.0   192.168.10.17  37759.0   \n",
       "25787310  1.499458e+09  23.10.108.151    443.0   192.168.10.14  59111.0   \n",
       "25787311  1.499458e+09  23.10.108.151    443.0   192.168.10.14  59111.0   \n",
       "25787312  1.499458e+09  192.168.10.51  59581.0   162.213.33.50    443.0   \n",
       "25787317  1.499458e+09   192.168.10.8  10399.0  184.84.243.218     80.0   \n",
       "\n",
       "         protocol_m   sttl  total_len first_layer  \\\n",
       "0               tcp  118.0     1500.0    Ethernet   \n",
       "1               tcp  118.0     2960.0    Ethernet   \n",
       "2               tcp  118.0     1500.0    Ethernet   \n",
       "3               tcp  118.0     1500.0    Ethernet   \n",
       "4               tcp  118.0     2960.0    Ethernet   \n",
       "...             ...    ...        ...         ...   \n",
       "25787305        tcp   64.0      900.0    Ethernet   \n",
       "25787310        tcp   54.0       71.0    Ethernet   \n",
       "25787311        tcp   54.0       71.0    Ethernet   \n",
       "25787312        tcp   64.0      200.0    Ethernet   \n",
       "25787317        tcp  128.0       41.0    Ethernet   \n",
       "\n",
       "                                                     packet  ...  \\\n",
       "0         001e4fd4ca2800c1b114eb310800450005dc3d01400076...  ...   \n",
       "1         001e4fd4ca2800c1b114eb31080045000b903d0f400076...  ...   \n",
       "2         001e4fd4ca2800c1b114eb310800450005dc3d01400076...  ...   \n",
       "3         001e4fd4ca2800c1b114eb310800450005dc3d02400076...  ...   \n",
       "4         001e4fd4ca2800c1b114eb31080045000b903d0d400076...  ...   \n",
       "...                                                     ...  ...   \n",
       "25787305  0023ae9b95670019b90a69f108004500038442a9400040...  ...   \n",
       "25787310  b8ac6f3607ee00c1b114eb31080045000047aaf3400036...  ...   \n",
       "25787311  b8ac6f3607ee00c1b114eb31080045000047aaf3400036...  ...   \n",
       "25787312  00c1b114eb31b8ac6f360ba80800450000c8d6bc400040...  ...   \n",
       "25787317  00c1b114eb31b8ac6f3608f50800450000297740400080...  ...   \n",
       "\n",
       "         DCE/RPC v5 rpc_vers  DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  \\\n",
       "0                        NaN               NaN                   NaN   \n",
       "1                        NaN               NaN                   NaN   \n",
       "2                        NaN               NaN                   NaN   \n",
       "3                        NaN               NaN                   NaN   \n",
       "4                        NaN               NaN                   NaN   \n",
       "...                      ...               ...                   ...   \n",
       "25787305                 NaN               NaN                   NaN   \n",
       "25787310                 NaN               NaN                   NaN   \n",
       "25787311                 NaN               NaN                   NaN   \n",
       "25787312                 NaN               NaN                   NaN   \n",
       "25787317                 NaN               NaN                   NaN   \n",
       "\n",
       "          DCE/RPC v5 endian DCE/RPC v5 encoding  DCE/RPC v5 float  \\\n",
       "0                       NaN                 NaN               NaN   \n",
       "1                       NaN                 NaN               NaN   \n",
       "2                       NaN                 NaN               NaN   \n",
       "3                       NaN                 NaN               NaN   \n",
       "4                       NaN                 NaN               NaN   \n",
       "...                     ...                 ...               ...   \n",
       "25787305                NaN                 NaN               NaN   \n",
       "25787310                NaN                 NaN               NaN   \n",
       "25787311                NaN                 NaN               NaN   \n",
       "25787312                NaN                 NaN               NaN   \n",
       "25787317                NaN                 NaN               NaN   \n",
       "\n",
       "         DCE/RPC v5 frag_len  DCE/RPC v5 auth_len DCE/RPC v5 call_id  \\\n",
       "0                        NaN                  NaN                NaN   \n",
       "1                        NaN                  NaN                NaN   \n",
       "2                        NaN                  NaN                NaN   \n",
       "3                        NaN                  NaN                NaN   \n",
       "4                        NaN                  NaN                NaN   \n",
       "...                      ...                  ...                ...   \n",
       "25787305                 NaN                  NaN                NaN   \n",
       "25787310                 NaN                  NaN                NaN   \n",
       "25787311                 NaN                  NaN                NaN   \n",
       "25787312                 NaN                  NaN                NaN   \n",
       "25787317                 NaN                  NaN                NaN   \n",
       "\n",
       "          SMB2 Negotiate Protocol Response load  \n",
       "0                                           NaN  \n",
       "1                                           NaN  \n",
       "2                                           NaN  \n",
       "3                                           NaN  \n",
       "4                                           NaN  \n",
       "...                                         ...  \n",
       "25787305                                    NaN  \n",
       "25787310                                    NaN  \n",
       "25787311                                    NaN  \n",
       "25787312                                    NaN  \n",
       "25787317                                    NaN  \n",
       "\n",
       "[14952904 rows x 78 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the frame after the TCP / sparse-column filtering (pandas shows a truncated head/tail view).\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6074a9ae-0dc9-4d30-9b3a-8f906a80316e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BENIGN                        14235590\n",
       "DDoS                            619353\n",
       "Infiltration                     56242\n",
       "Web Attack – Brute Force         28922\n",
       "Web Attack – XSS                  6767\n",
       "Bot                               5147\n",
       "PortScan                           838\n",
       "Web Attack – Sql Injection          45\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count rows per attack_cat label in the frame currently bound to df.\n",
    "# NOTE(review): 'Part 3' presumably identifies which portion of the data df held when this ran - confirm.\n",
    "df['attack_cat'].value_counts() # Part 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "22b3ff1e-1f41-4344-8173-e1380bac771b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from an empty frame; per-class subsets (sdf) are concatenated onto fdf in a later cell.\n",
    "# Re-running this cell discards everything accumulated so far.\n",
    "fdf = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "be184a53-4cbb-4e95-975f-f1e1281bb5eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "sdf = df[df['attack_cat']=='Bot']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "7169d6f6-e343-4112-9a75-b7efadc4feb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "sdf = df[df['attack_cat']=='Infiltration'].sample(50000, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "c9139a37-783b-4c04-a3ee-c0003fd3cecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the current class subset (sdf) to the accumulated frame (fdf) and renumber the index.\n",
    "# Not idempotent: running this cell again without reassigning sdf appends the same rows twice.\n",
    "fdf = pd.concat([fdf, sdf], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "dcd42de3-6389-4df7-b7e9-56a3b978de5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 rpc_vers</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>SMB2 Negotiate Protocol Response load</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499453e+09</td>\n",
       "      <td>69.4.95.11</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>58402.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>50.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc05fe400032...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b9038ad400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc6449400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b907fa1400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b90366a400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156714</th>\n",
       "      <td>1.499436e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>3051.0</td>\n",
       "      <td>205.174.165.73</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>238.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156715</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156716</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156717</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156718</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>156719 rows × 78 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime          srcip    sport           dstip   dsport  \\\n",
       "0       1.499453e+09     69.4.95.11     80.0   192.168.10.15  58402.0   \n",
       "1       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "2       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "3       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "4       1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "...              ...            ...      ...             ...      ...   \n",
       "156714  1.499436e+09   192.168.10.8   3051.0  205.174.165.73   8080.0   \n",
       "156715  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "156716  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "156717  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "156718  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "\n",
       "       protocol_m   sttl  total_len first_layer  \\\n",
       "0             tcp   50.0     1500.0    Ethernet   \n",
       "1             tcp  118.0     2960.0    Ethernet   \n",
       "2             tcp  118.0     1500.0    Ethernet   \n",
       "3             tcp  118.0     2960.0    Ethernet   \n",
       "4             tcp  118.0     2960.0    Ethernet   \n",
       "...           ...    ...        ...         ...   \n",
       "156714        tcp  128.0      238.0    Ethernet   \n",
       "156715        tcp  237.0      308.0    Ethernet   \n",
       "156716        tcp  237.0      308.0    Ethernet   \n",
       "156717        tcp   64.0      465.0    Ethernet   \n",
       "156718        tcp   64.0      465.0    Ethernet   \n",
       "\n",
       "                                                   packet  ...  \\\n",
       "0       001e4fd4ca2800c1b114eb310800450005dc05fe400032...  ...   \n",
       "1       001e4fd4ca2800c1b114eb31080045000b9038ad400076...  ...   \n",
       "2       001e4fd4ca2800c1b114eb310800450005dc6449400076...  ...   \n",
       "3       001e4fd4ca2800c1b114eb31080045000b907fa1400076...  ...   \n",
       "4       001e4fd4ca2800c1b114eb31080045000b90366a400076...  ...   \n",
       "...                                                   ...  ...   \n",
       "156714  00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...  ...   \n",
       "156715  0023ae9b956700c1b114eb31080045000134db614000ed...  ...   \n",
       "156716  0023ae9b956700c1b114eb31080045000134db614000ed...  ...   \n",
       "156717  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...   \n",
       "156718  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...   \n",
       "\n",
       "       DCE/RPC v5 rpc_vers  DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  \\\n",
       "0                      NaN               NaN                   NaN   \n",
       "1                      NaN               NaN                   NaN   \n",
       "2                      NaN               NaN                   NaN   \n",
       "3                      NaN               NaN                   NaN   \n",
       "4                      NaN               NaN                   NaN   \n",
       "...                    ...               ...                   ...   \n",
       "156714                 NaN               NaN                   NaN   \n",
       "156715                 NaN               NaN                   NaN   \n",
       "156716                 NaN               NaN                   NaN   \n",
       "156717                 NaN               NaN                   NaN   \n",
       "156718                 NaN               NaN                   NaN   \n",
       "\n",
       "        DCE/RPC v5 endian DCE/RPC v5 encoding  DCE/RPC v5 float  \\\n",
       "0                     NaN                 NaN               NaN   \n",
       "1                     NaN                 NaN               NaN   \n",
       "2                     NaN                 NaN               NaN   \n",
       "3                     NaN                 NaN               NaN   \n",
       "4                     NaN                 NaN               NaN   \n",
       "...                   ...                 ...               ...   \n",
       "156714                NaN                 NaN               NaN   \n",
       "156715                NaN                 NaN               NaN   \n",
       "156716                NaN                 NaN               NaN   \n",
       "156717                NaN                 NaN               NaN   \n",
       "156718                NaN                 NaN               NaN   \n",
       "\n",
       "       DCE/RPC v5 frag_len  DCE/RPC v5 auth_len DCE/RPC v5 call_id  \\\n",
       "0                      NaN                  NaN                NaN   \n",
       "1                      NaN                  NaN                NaN   \n",
       "2                      NaN                  NaN                NaN   \n",
       "3                      NaN                  NaN                NaN   \n",
       "4                      NaN                  NaN                NaN   \n",
       "...                    ...                  ...                ...   \n",
       "156714                 NaN                  NaN                NaN   \n",
       "156715                 NaN                  NaN                NaN   \n",
       "156716                 NaN                  NaN                NaN   \n",
       "156717                 NaN                  NaN                NaN   \n",
       "156718                 NaN                  NaN                NaN   \n",
       "\n",
       "        SMB2 Negotiate Protocol Response load  \n",
       "0                                         NaN  \n",
       "1                                         NaN  \n",
       "2                                         NaN  \n",
       "3                                         NaN  \n",
       "4                                         NaN  \n",
       "...                                       ...  \n",
       "156714                                    NaN  \n",
       "156715                                    NaN  \n",
       "156716                                    NaN  \n",
       "156717                                    NaN  \n",
       "156718                                    NaN  \n",
       "\n",
       "[156719 rows x 78 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "27c6003f-bb81-4e16-aa09-aadb7028c700",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DDoS                          50000\n",
       "Infiltration                  50000\n",
       "Web Attack – Brute Force      28922\n",
       "BENIGN                        15000\n",
       "Web Attack – XSS               6767\n",
       "Bot                            5147\n",
       "PortScan                        838\n",
       "Web Attack – Sql Injection       45\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fdf['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "9dbe7398-3911-4806-adaa-b9f2b150f96d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the sampled frame so the 'Combining CSV Files' section can reload it.\n",
    "# Create the target directory first so this cell also works on a fresh checkout.\n",
    "os.makedirs('./CICIDS/CICIDS-3', exist_ok=True)\n",
    "fdf.to_csv('./CICIDS/CICIDS-3/output3.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4a18857-dbd8-431f-aa52-42d31c2f070d",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# Combining CSV Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "112f3eff-94a6-4c15-af92-416f04071590",
   "metadata": {},
   "outputs": [],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file at once\n",
    "# instead of chunk by chunk, which is what produced the mixed-type DtypeWarning here.\n",
    "df1 = pd.read_csv('./CICIDS/CICIDS-3/output1.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "88cd923c-09ce-40d9-95ac-db896320f933",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>LDAP referral</th>\n",
       "      <th>Kerberos from_</th>\n",
       "      <th>Kerberos till</th>\n",
       "      <th>Kerberos rtime</th>\n",
       "      <th>Kerberos nonce</th>\n",
       "      <th>LDAP scope</th>\n",
       "      <th>LDAP present</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499171e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>50580.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc2d27400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499096e+09</td>\n",
       "      <td>35.186.253.0</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.25</td>\n",
       "      <td>55458.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1638.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>002500a8c46000c1b114eb31080045000666586800002d...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499085e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49647.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc45f6400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499173e+09</td>\n",
       "      <td>8.253.104.14</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>51929.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>54.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dcaec5000036...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499171e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>50128.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>119.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb31080045000b905f18400077...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84995</th>\n",
       "      <td>1.499189e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>47108.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f1080045000074cc4c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84996</th>\n",
       "      <td>1.499189e+09</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>47162.0</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>62.0</td>\n",
       "      <td>164.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0019b90a69f100c1b114eb310800450000a4a9a340003e...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84997</th>\n",
       "      <td>1.499188e+09</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>46588.0</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>62.0</td>\n",
       "      <td>324.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0019b90a69f100c1b114eb31080045000144d14d40003e...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84998</th>\n",
       "      <td>1.499188e+09</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>46842.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>93.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310019b90a69f108004500005dc135400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84999</th>\n",
       "      <td>1.499188e+09</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>46578.0</td>\n",
       "      <td>192.168.10.50</td>\n",
       "      <td>22.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>62.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0019b90a69f100c1b114eb31080045000074cf2740003e...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>85000 rows × 76 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              stime          srcip    sport          dstip   dsport  \\\n",
       "0      1.499171e+09    13.107.4.50     80.0  192.168.10.15  50580.0   \n",
       "1      1.499096e+09   35.186.253.0    443.0  192.168.10.25  55458.0   \n",
       "2      1.499085e+09    13.107.4.50     80.0  192.168.10.15  49647.0   \n",
       "3      1.499173e+09   8.253.104.14     80.0  192.168.10.15  51929.0   \n",
       "4      1.499171e+09    13.107.4.50     80.0  192.168.10.14  50128.0   \n",
       "...             ...            ...      ...            ...      ...   \n",
       "84995  1.499189e+09  192.168.10.50     22.0     172.16.0.1  47108.0   \n",
       "84996  1.499189e+09     172.16.0.1  47162.0  192.168.10.50     22.0   \n",
       "84997  1.499188e+09     172.16.0.1  46588.0  192.168.10.50     22.0   \n",
       "84998  1.499188e+09  192.168.10.50     22.0     172.16.0.1  46842.0   \n",
       "84999  1.499188e+09     172.16.0.1  46578.0  192.168.10.50     22.0   \n",
       "\n",
       "      protocol_m   sttl  total_len first_layer  \\\n",
       "0            tcp  118.0     1500.0    Ethernet   \n",
       "1            tcp   45.0     1638.0    Ethernet   \n",
       "2            tcp  118.0     1500.0    Ethernet   \n",
       "3            tcp   54.0     1500.0    Ethernet   \n",
       "4            tcp  119.0     2960.0    Ethernet   \n",
       "...          ...    ...        ...         ...   \n",
       "84995        tcp   64.0      116.0    Ethernet   \n",
       "84996        tcp   62.0      164.0    Ethernet   \n",
       "84997        tcp   62.0      324.0    Ethernet   \n",
       "84998        tcp   64.0       93.0    Ethernet   \n",
       "84999        tcp   62.0      116.0    Ethernet   \n",
       "\n",
       "                                                  packet  ...  \\\n",
       "0      001e4fd4ca2800c1b114eb310800450005dc2d27400076...  ...   \n",
       "1      002500a8c46000c1b114eb31080045000666586800002d...  ...   \n",
       "2      001e4fd4ca2800c1b114eb310800450005dc45f6400076...  ...   \n",
       "3      001e4fd4ca2800c1b114eb310800450005dcaec5000036...  ...   \n",
       "4      b8ac6f3607ee00c1b114eb31080045000b905f18400077...  ...   \n",
       "...                                                  ...  ...   \n",
       "84995  00c1b114eb310019b90a69f1080045000074cc4c400040...  ...   \n",
       "84996  0019b90a69f100c1b114eb310800450000a4a9a340003e...  ...   \n",
       "84997  0019b90a69f100c1b114eb31080045000144d14d40003e...  ...   \n",
       "84998  00c1b114eb310019b90a69f108004500005dc135400040...  ...   \n",
       "84999  0019b90a69f100c1b114eb31080045000074cf2740003e...  ...   \n",
       "\n",
       "      DCE/RPC v5 frag_len  DCE/RPC v5 auth_len  DCE/RPC v5 call_id  \\\n",
       "0                     NaN                  NaN                 NaN   \n",
       "1                     NaN                  NaN                 NaN   \n",
       "2                     NaN                  NaN                 NaN   \n",
       "3                     NaN                  NaN                 NaN   \n",
       "4                     NaN                  NaN                 NaN   \n",
       "...                   ...                  ...                 ...   \n",
       "84995                 NaN                  NaN                 NaN   \n",
       "84996                 NaN                  NaN                 NaN   \n",
       "84997                 NaN                  NaN                 NaN   \n",
       "84998                 NaN                  NaN                 NaN   \n",
       "84999                 NaN                  NaN                 NaN   \n",
       "\n",
       "       LDAP referral Kerberos from_  Kerberos till Kerberos rtime  \\\n",
       "0                NaN            NaN            NaN            NaN   \n",
       "1                NaN            NaN            NaN            NaN   \n",
       "2                NaN            NaN            NaN            NaN   \n",
       "3                NaN            NaN            NaN            NaN   \n",
       "4                NaN            NaN            NaN            NaN   \n",
       "...              ...            ...            ...            ...   \n",
       "84995            NaN            NaN            NaN            NaN   \n",
       "84996            NaN            NaN            NaN            NaN   \n",
       "84997            NaN            NaN            NaN            NaN   \n",
       "84998            NaN            NaN            NaN            NaN   \n",
       "84999            NaN            NaN            NaN            NaN   \n",
       "\n",
       "       Kerberos nonce LDAP scope  LDAP present  \n",
       "0                 NaN        NaN           NaN  \n",
       "1                 NaN        NaN           NaN  \n",
       "2                 NaN        NaN           NaN  \n",
       "3                 NaN        NaN           NaN  \n",
       "4                 NaN        NaN           NaN  \n",
       "...               ...        ...           ...  \n",
       "84995             NaN        NaN           NaN  \n",
       "84996             NaN        NaN           NaN  \n",
       "84997             NaN        NaN           NaN  \n",
       "84998             NaN        NaN           NaN  \n",
       "84999             NaN        NaN           NaN  \n",
       "\n",
       "[85000 rows x 76 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "b51074e8-a92c-46ff-99e6-c261cb8d54c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file at once\n",
    "# instead of chunk by chunk, which is what produced the mixed-type DtypeWarning here.\n",
    "df2 = pd.read_csv('./CICIDS/CICIDS-3/output2.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "320038bd-5914-4b3f-8460-039782eb1fa0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>LDAP mechanism</th>\n",
       "      <th>DCE/RPC v5 rpc_vers</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499193e+09</td>\n",
       "      <td>185.49.134.3</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>33666.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>48.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb310800450005dc4985400030...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499256e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49672.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc15e3400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499190e+09</td>\n",
       "      <td>52.84.145.229</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.25</td>\n",
       "      <td>60532.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>245.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>002500a8c46000c1b114eb310800450005dc630f4000f5...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499256e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49672.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc3432400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499256e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49672.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc1e87400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214211</th>\n",
       "      <td>1.499280e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>444.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>45022.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>5844.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450016d42b3d400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214212</th>\n",
       "      <td>1.499280e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>444.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>45022.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450005dc2b3c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214213</th>\n",
       "      <td>1.499280e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>444.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>45022.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450005dc2b3c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214214</th>\n",
       "      <td>1.499280e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>444.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>45022.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>5844.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450016d42b38400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>214215</th>\n",
       "      <td>1.499280e+09</td>\n",
       "      <td>192.168.10.51</td>\n",
       "      <td>444.0</td>\n",
       "      <td>172.16.0.1</td>\n",
       "      <td>45022.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>5844.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f360ba80800450016d42b38400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>214216 rows × 69 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime          srcip  sport          dstip   dsport protocol_m  \\\n",
       "0       1.499193e+09   185.49.134.3  443.0  192.168.10.17  33666.0        tcp   \n",
       "1       1.499256e+09    13.107.4.50   80.0  192.168.10.15  49672.0        tcp   \n",
       "2       1.499190e+09  52.84.145.229  443.0  192.168.10.25  60532.0        tcp   \n",
       "3       1.499256e+09    13.107.4.50   80.0  192.168.10.15  49672.0        tcp   \n",
       "4       1.499256e+09    13.107.4.50   80.0  192.168.10.15  49672.0        tcp   \n",
       "...              ...            ...    ...            ...      ...        ...   \n",
       "214211  1.499280e+09  192.168.10.51  444.0     172.16.0.1  45022.0        tcp   \n",
       "214212  1.499280e+09  192.168.10.51  444.0     172.16.0.1  45022.0        tcp   \n",
       "214213  1.499280e+09  192.168.10.51  444.0     172.16.0.1  45022.0        tcp   \n",
       "214214  1.499280e+09  192.168.10.51  444.0     172.16.0.1  45022.0        tcp   \n",
       "214215  1.499280e+09  192.168.10.51  444.0     172.16.0.1  45022.0        tcp   \n",
       "\n",
       "         sttl  total_len first_layer  \\\n",
       "0        48.0     1500.0    Ethernet   \n",
       "1       118.0     1500.0    Ethernet   \n",
       "2       245.0     1500.0    Ethernet   \n",
       "3       118.0     1500.0    Ethernet   \n",
       "4       118.0     1500.0    Ethernet   \n",
       "...       ...        ...         ...   \n",
       "214211   64.0     5844.0    Ethernet   \n",
       "214212   64.0     1500.0    Ethernet   \n",
       "214213   64.0     1500.0    Ethernet   \n",
       "214214   64.0     5844.0    Ethernet   \n",
       "214215   64.0     5844.0    Ethernet   \n",
       "\n",
       "                                                   packet  ... LDAP mechanism  \\\n",
       "0       0023ae9b956700c1b114eb310800450005dc4985400030...  ...            NaN   \n",
       "1       001e4fd4ca2800c1b114eb310800450005dc15e3400076...  ...            NaN   \n",
       "2       002500a8c46000c1b114eb310800450005dc630f4000f5...  ...            NaN   \n",
       "3       001e4fd4ca2800c1b114eb310800450005dc3432400076...  ...            NaN   \n",
       "4       001e4fd4ca2800c1b114eb310800450005dc1e87400076...  ...            NaN   \n",
       "...                                                   ...  ...            ...   \n",
       "214211  00c1b114eb31b8ac6f360ba80800450016d42b3d400040...  ...            NaN   \n",
       "214212  00c1b114eb31b8ac6f360ba80800450005dc2b3c400040...  ...            NaN   \n",
       "214213  00c1b114eb31b8ac6f360ba80800450005dc2b3c400040...  ...            NaN   \n",
       "214214  00c1b114eb31b8ac6f360ba80800450016d42b38400040...  ...            NaN   \n",
       "214215  00c1b114eb31b8ac6f360ba80800450016d42b38400040...  ...            NaN   \n",
       "\n",
       "        DCE/RPC v5 rpc_vers  DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  \\\n",
       "0                       NaN               NaN                   NaN   \n",
       "1                       NaN               NaN                   NaN   \n",
       "2                       NaN               NaN                   NaN   \n",
       "3                       NaN               NaN                   NaN   \n",
       "4                       NaN               NaN                   NaN   \n",
       "...                     ...               ...                   ...   \n",
       "214211                  NaN               NaN                   NaN   \n",
       "214212                  NaN               NaN                   NaN   \n",
       "214213                  NaN               NaN                   NaN   \n",
       "214214                  NaN               NaN                   NaN   \n",
       "214215                  NaN               NaN                   NaN   \n",
       "\n",
       "       DCE/RPC v5 endian  DCE/RPC v5 encoding DCE/RPC v5 float  \\\n",
       "0                    NaN                  NaN              NaN   \n",
       "1                    NaN                  NaN              NaN   \n",
       "2                    NaN                  NaN              NaN   \n",
       "3                    NaN                  NaN              NaN   \n",
       "4                    NaN                  NaN              NaN   \n",
       "...                  ...                  ...              ...   \n",
       "214211               NaN                  NaN              NaN   \n",
       "214212               NaN                  NaN              NaN   \n",
       "214213               NaN                  NaN              NaN   \n",
       "214214               NaN                  NaN              NaN   \n",
       "214215               NaN                  NaN              NaN   \n",
       "\n",
       "        DCE/RPC v5 frag_len DCE/RPC v5 auth_len  DCE/RPC v5 call_id  \n",
       "0                       NaN                 NaN                 NaN  \n",
       "1                       NaN                 NaN                 NaN  \n",
       "2                       NaN                 NaN                 NaN  \n",
       "3                       NaN                 NaN                 NaN  \n",
       "4                       NaN                 NaN                 NaN  \n",
       "...                     ...                 ...                 ...  \n",
       "214211                  NaN                 NaN                 NaN  \n",
       "214212                  NaN                 NaN                 NaN  \n",
       "214213                  NaN                 NaN                 NaN  \n",
       "214214                  NaN                 NaN                 NaN  \n",
       "214215                  NaN                 NaN                 NaN  \n",
       "\n",
       "[214216 rows x 69 columns]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview df2 (read from output2.csv); the notebook repr elides the middle\n",
    "# rows and columns and reports the overall shape at the bottom.\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "83eacaf2-070b-4d19-b8eb-02ae08707cde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the third CICIDS-3 packet export.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole file\n",
    "# instead of chunk by chunk, which is what triggered the mixed-type DtypeWarning.\n",
    "df3 = pd.read_csv('./CICIDS/CICIDS-3/output3.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d658f885-0376-4b80-9d4a-6a93529d933f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>DCE/RPC v5 rpc_vers</th>\n",
       "      <th>DCE/RPC v5 ptype</th>\n",
       "      <th>DCE/RPC v5 pfc_flags</th>\n",
       "      <th>DCE/RPC v5 endian</th>\n",
       "      <th>DCE/RPC v5 encoding</th>\n",
       "      <th>DCE/RPC v5 float</th>\n",
       "      <th>DCE/RPC v5 frag_len</th>\n",
       "      <th>DCE/RPC v5 auth_len</th>\n",
       "      <th>DCE/RPC v5 call_id</th>\n",
       "      <th>SMB2 Negotiate Protocol Response load</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499453e+09</td>\n",
       "      <td>69.4.95.11</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>58402.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>50.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc05fe400032...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b9038ad400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc6449400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499430e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49808.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b907fa1400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499343e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49910.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb31080045000b90366a400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156714</th>\n",
       "      <td>1.499436e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>3051.0</td>\n",
       "      <td>205.174.165.73</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>238.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156715</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156716</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156717</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156718</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>156719 rows × 78 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime          srcip    sport           dstip   dsport  \\\n",
       "0       1.499453e+09     69.4.95.11     80.0   192.168.10.15  58402.0   \n",
       "1       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "2       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "3       1.499430e+09    13.107.4.50     80.0   192.168.10.15  49808.0   \n",
       "4       1.499343e+09    13.107.4.50     80.0   192.168.10.15  49910.0   \n",
       "...              ...            ...      ...             ...      ...   \n",
       "156714  1.499436e+09   192.168.10.8   3051.0  205.174.165.73   8080.0   \n",
       "156715  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "156716  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "156717  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "156718  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "\n",
       "       protocol_m   sttl  total_len first_layer  \\\n",
       "0             tcp   50.0     1500.0    Ethernet   \n",
       "1             tcp  118.0     2960.0    Ethernet   \n",
       "2             tcp  118.0     1500.0    Ethernet   \n",
       "3             tcp  118.0     2960.0    Ethernet   \n",
       "4             tcp  118.0     2960.0    Ethernet   \n",
       "...           ...    ...        ...         ...   \n",
       "156714        tcp  128.0      238.0    Ethernet   \n",
       "156715        tcp  237.0      308.0    Ethernet   \n",
       "156716        tcp  237.0      308.0    Ethernet   \n",
       "156717        tcp   64.0      465.0    Ethernet   \n",
       "156718        tcp   64.0      465.0    Ethernet   \n",
       "\n",
       "                                                   packet  ...  \\\n",
       "0       001e4fd4ca2800c1b114eb310800450005dc05fe400032...  ...   \n",
       "1       001e4fd4ca2800c1b114eb31080045000b9038ad400076...  ...   \n",
       "2       001e4fd4ca2800c1b114eb310800450005dc6449400076...  ...   \n",
       "3       001e4fd4ca2800c1b114eb31080045000b907fa1400076...  ...   \n",
       "4       001e4fd4ca2800c1b114eb31080045000b90366a400076...  ...   \n",
       "...                                                   ...  ...   \n",
       "156714  00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...  ...   \n",
       "156715  0023ae9b956700c1b114eb31080045000134db614000ed...  ...   \n",
       "156716  0023ae9b956700c1b114eb31080045000134db614000ed...  ...   \n",
       "156717  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...   \n",
       "156718  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...   \n",
       "\n",
       "       DCE/RPC v5 rpc_vers  DCE/RPC v5 ptype  DCE/RPC v5 pfc_flags  \\\n",
       "0                      NaN               NaN                   NaN   \n",
       "1                      NaN               NaN                   NaN   \n",
       "2                      NaN               NaN                   NaN   \n",
       "3                      NaN               NaN                   NaN   \n",
       "4                      NaN               NaN                   NaN   \n",
       "...                    ...               ...                   ...   \n",
       "156714                 NaN               NaN                   NaN   \n",
       "156715                 NaN               NaN                   NaN   \n",
       "156716                 NaN               NaN                   NaN   \n",
       "156717                 NaN               NaN                   NaN   \n",
       "156718                 NaN               NaN                   NaN   \n",
       "\n",
       "        DCE/RPC v5 endian DCE/RPC v5 encoding  DCE/RPC v5 float  \\\n",
       "0                     NaN                 NaN               NaN   \n",
       "1                     NaN                 NaN               NaN   \n",
       "2                     NaN                 NaN               NaN   \n",
       "3                     NaN                 NaN               NaN   \n",
       "4                     NaN                 NaN               NaN   \n",
       "...                   ...                 ...               ...   \n",
       "156714                NaN                 NaN               NaN   \n",
       "156715                NaN                 NaN               NaN   \n",
       "156716                NaN                 NaN               NaN   \n",
       "156717                NaN                 NaN               NaN   \n",
       "156718                NaN                 NaN               NaN   \n",
       "\n",
       "       DCE/RPC v5 frag_len  DCE/RPC v5 auth_len DCE/RPC v5 call_id  \\\n",
       "0                      NaN                  NaN                NaN   \n",
       "1                      NaN                  NaN                NaN   \n",
       "2                      NaN                  NaN                NaN   \n",
       "3                      NaN                  NaN                NaN   \n",
       "4                      NaN                  NaN                NaN   \n",
       "...                    ...                  ...                ...   \n",
       "156714                 NaN                  NaN                NaN   \n",
       "156715                 NaN                  NaN                NaN   \n",
       "156716                 NaN                  NaN                NaN   \n",
       "156717                 NaN                  NaN                NaN   \n",
       "156718                 NaN                  NaN                NaN   \n",
       "\n",
       "        SMB2 Negotiate Protocol Response load  \n",
       "0                                         NaN  \n",
       "1                                         NaN  \n",
       "2                                         NaN  \n",
       "3                                         NaN  \n",
       "4                                         NaN  \n",
       "...                                       ...  \n",
       "156714                                    NaN  \n",
       "156715                                    NaN  \n",
       "156716                                    NaN  \n",
       "156717                                    NaN  \n",
       "156718                                    NaN  \n",
       "\n",
       "[156719 rows x 78 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "45984331-2d8b-4ca8-adb2-447e50d43509",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df1, df2, df3], ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "d6db12e0-94f0-4579-a90c-bf72b60e2b96",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stime</th>\n",
       "      <th>srcip</th>\n",
       "      <th>sport</th>\n",
       "      <th>dstip</th>\n",
       "      <th>dsport</th>\n",
       "      <th>protocol_m</th>\n",
       "      <th>sttl</th>\n",
       "      <th>total_len</th>\n",
       "      <th>first_layer</th>\n",
       "      <th>packet</th>\n",
       "      <th>...</th>\n",
       "      <th>LDAP referral</th>\n",
       "      <th>Kerberos from_</th>\n",
       "      <th>Kerberos till</th>\n",
       "      <th>Kerberos rtime</th>\n",
       "      <th>Kerberos nonce</th>\n",
       "      <th>LDAP scope</th>\n",
       "      <th>LDAP present</th>\n",
       "      <th>SMB Negotiate Extended Security Response (SMB) load</th>\n",
       "      <th>SMB2 LOGOFF Request reserved</th>\n",
       "      <th>SMB2 Negotiate Protocol Response load</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.499171e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>50580.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc2d27400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.499096e+09</td>\n",
       "      <td>35.186.253.0</td>\n",
       "      <td>443.0</td>\n",
       "      <td>192.168.10.25</td>\n",
       "      <td>55458.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1638.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>002500a8c46000c1b114eb31080045000666586800002d...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.499085e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>49647.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>118.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dc45f6400076...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.499173e+09</td>\n",
       "      <td>8.253.104.14</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.15</td>\n",
       "      <td>51929.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>54.0</td>\n",
       "      <td>1500.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>001e4fd4ca2800c1b114eb310800450005dcaec5000036...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.499171e+09</td>\n",
       "      <td>13.107.4.50</td>\n",
       "      <td>80.0</td>\n",
       "      <td>192.168.10.14</td>\n",
       "      <td>50128.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>119.0</td>\n",
       "      <td>2960.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>b8ac6f3607ee00c1b114eb31080045000b905f18400077...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455930</th>\n",
       "      <td>1.499436e+09</td>\n",
       "      <td>192.168.10.8</td>\n",
       "      <td>3051.0</td>\n",
       "      <td>205.174.165.73</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>128.0</td>\n",
       "      <td>238.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455931</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455932</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>237.0</td>\n",
       "      <td>308.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>0023ae9b956700c1b114eb31080045000134db614000ed...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455933</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>455934</th>\n",
       "      <td>1.499437e+09</td>\n",
       "      <td>192.168.10.17</td>\n",
       "      <td>48034.0</td>\n",
       "      <td>52.7.235.158</td>\n",
       "      <td>8080.0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>64.0</td>\n",
       "      <td>465.0</td>\n",
       "      <td>Ethernet</td>\n",
       "      <td>00c1b114eb310023ae9b95670800450001d19a5c400040...</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>455935 rows × 79 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               stime          srcip    sport           dstip   dsport  \\\n",
       "0       1.499171e+09    13.107.4.50     80.0   192.168.10.15  50580.0   \n",
       "1       1.499096e+09   35.186.253.0    443.0   192.168.10.25  55458.0   \n",
       "2       1.499085e+09    13.107.4.50     80.0   192.168.10.15  49647.0   \n",
       "3       1.499173e+09   8.253.104.14     80.0   192.168.10.15  51929.0   \n",
       "4       1.499171e+09    13.107.4.50     80.0   192.168.10.14  50128.0   \n",
       "...              ...            ...      ...             ...      ...   \n",
       "455930  1.499436e+09   192.168.10.8   3051.0  205.174.165.73   8080.0   \n",
       "455931  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "455932  1.499437e+09   52.7.235.158   8080.0   192.168.10.17  48034.0   \n",
       "455933  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "455934  1.499437e+09  192.168.10.17  48034.0    52.7.235.158   8080.0   \n",
       "\n",
       "       protocol_m   sttl  total_len first_layer  \\\n",
       "0             tcp  118.0     1500.0    Ethernet   \n",
       "1             tcp   45.0     1638.0    Ethernet   \n",
       "2             tcp  118.0     1500.0    Ethernet   \n",
       "3             tcp   54.0     1500.0    Ethernet   \n",
       "4             tcp  119.0     2960.0    Ethernet   \n",
       "...           ...    ...        ...         ...   \n",
       "455930        tcp  128.0      238.0    Ethernet   \n",
       "455931        tcp  237.0      308.0    Ethernet   \n",
       "455932        tcp  237.0      308.0    Ethernet   \n",
       "455933        tcp   64.0      465.0    Ethernet   \n",
       "455934        tcp   64.0      465.0    Ethernet   \n",
       "\n",
       "                                                   packet  ... LDAP referral  \\\n",
       "0       001e4fd4ca2800c1b114eb310800450005dc2d27400076...  ...           NaN   \n",
       "1       002500a8c46000c1b114eb31080045000666586800002d...  ...           NaN   \n",
       "2       001e4fd4ca2800c1b114eb310800450005dc45f6400076...  ...           NaN   \n",
       "3       001e4fd4ca2800c1b114eb310800450005dcaec5000036...  ...           NaN   \n",
       "4       b8ac6f3607ee00c1b114eb31080045000b905f18400077...  ...           NaN   \n",
       "...                                                   ...  ...           ...   \n",
       "455930  00c1b114eb31b8ac6f3608f50800450000ee2e9f400080...  ...           NaN   \n",
       "455931  0023ae9b956700c1b114eb31080045000134db614000ed...  ...           NaN   \n",
       "455932  0023ae9b956700c1b114eb31080045000134db614000ed...  ...           NaN   \n",
       "455933  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...           NaN   \n",
       "455934  00c1b114eb310023ae9b95670800450001d19a5c400040...  ...           NaN   \n",
       "\n",
       "        Kerberos from_  Kerberos till  Kerberos rtime Kerberos nonce  \\\n",
       "0                  NaN            NaN             NaN            NaN   \n",
       "1                  NaN            NaN             NaN            NaN   \n",
       "2                  NaN            NaN             NaN            NaN   \n",
       "3                  NaN            NaN             NaN            NaN   \n",
       "4                  NaN            NaN             NaN            NaN   \n",
       "...                ...            ...             ...            ...   \n",
       "455930             NaN            NaN             NaN            NaN   \n",
       "455931             NaN            NaN             NaN            NaN   \n",
       "455932             NaN            NaN             NaN            NaN   \n",
       "455933             NaN            NaN             NaN            NaN   \n",
       "455934             NaN            NaN             NaN            NaN   \n",
       "\n",
       "        LDAP scope LDAP present  \\\n",
       "0              NaN          NaN   \n",
       "1              NaN          NaN   \n",
       "2              NaN          NaN   \n",
       "3              NaN          NaN   \n",
       "4              NaN          NaN   \n",
       "...            ...          ...   \n",
       "455930         NaN          NaN   \n",
       "455931         NaN          NaN   \n",
       "455932         NaN          NaN   \n",
       "455933         NaN          NaN   \n",
       "455934         NaN          NaN   \n",
       "\n",
       "        SMB Negotiate Extended Security Response (SMB) load  \\\n",
       "0                                                     NaN     \n",
       "1                                                     NaN     \n",
       "2                                                     NaN     \n",
       "3                                                     NaN     \n",
       "4                                                     NaN     \n",
       "...                                                   ...     \n",
       "455930                                                NaN     \n",
       "455931                                                NaN     \n",
       "455932                                                NaN     \n",
       "455933                                                NaN     \n",
       "455934                                                NaN     \n",
       "\n",
       "       SMB2 LOGOFF Request reserved  SMB2 Negotiate Protocol Response load  \n",
       "0                               NaN                                    NaN  \n",
       "1                               NaN                                    NaN  \n",
       "2                               NaN                                    NaN  \n",
       "3                               NaN                                    NaN  \n",
       "4                               NaN                                    NaN  \n",
       "...                             ...                                    ...  \n",
       "455930                          NaN                                    NaN  \n",
       "455931                          NaN                                    NaN  \n",
       "455932                          NaN                                    NaN  \n",
       "455933                          NaN                                    NaN  \n",
       "455934                          NaN                                    NaN  \n",
       "\n",
       "[455935 rows x 79 columns]"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "fd56d79d-ad88-4b8e-a8af-3661ac53eeba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BENIGN                        50000\n",
       "FTP-Patator                   50000\n",
       "SSH-Patator                   50000\n",
       "DoS Hulk                      50000\n",
       "DDoS                          50000\n",
       "Infiltration                  50000\n",
       "Heartbleed                    41285\n",
       "DoS GoldenEye                 36151\n",
       "Web Attack – Brute Force      28922\n",
       "DoS slowloris                 25139\n",
       "DoS Slowhttptest              11641\n",
       "Web Attack – XSS               6767\n",
       "Bot                            5147\n",
       "PortScan                        838\n",
       "Web Attack – Sql Injection       45\n",
       "Name: attack_cat, dtype: int64"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['attack_cat'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "26ecdaa4-cb29-41b6-9730-b1fb2ed37a15",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('./CICIDS/CICIDS-4/output.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e20bd4e-a27e-4303-ac9c-aa4c9c41623e",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa8b5ae-2988-4de6-a817-c59e674a80ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3710999/511977792.py:7: DtypeWarning: Columns (16,18,36,47,48,50,53,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,72,80,85,86,87,88,89,90,92,96,97,98,99,100,101,102,103,104,105,106,107,112,116,120,121,122,123,124,125,126,127,128,132,133,135,138,142,147,148,149,150,151,152,153,154,155,157,158,159,160,161,162,171,173,178,179,180,181,182,183,184,185,187,188,190,191,192,193,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,226,231,232,233,234,235,237,238,239,240,243) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./CICIDS/output{i}.csv')\n",
      "/tmp/ipykernel_3710999/511977792.py:7: DtypeWarning: Columns (16,18,37,47,49,50,52,55,63,68,69,70,71,72,73,75,76,77,78,79,82,83,84,85,86,87,88,90,95,96,97,98,99,100,101,102,108,112,113,114,115,116,117,118,119,120,121,122,123,124,125,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,146,150,153,154,155,156,157,160,161,162,164,165,166,169,170,171,172,173,174,177,181,186,187,188,189,190,191,193,194,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,213,221) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./CICIDS/output{i}.csv')\n",
      "/tmp/ipykernel_3710999/511977792.py:7: DtypeWarning: Columns (10,16,18,37,47,49,50,51,53,54,56,61,62,63,64,65,68,76,81,82,83,84,85,86,88,89,90,91,94,95,96,97,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,129,130,131,132,133,137,138,139,140,141,142,143,144,145,146,147,148,149,154,156,159,160,161,168,172,177,178,179,180,181,182,183,184,187,188,189,190,191,193,194) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  temp_df = pd.read_csv(f'./CICIDS/output{i}.csv')\n"
     ]
    }
   ],
   "source": [
    "k = 1\n",
    "df = pd.DataFrame()\n",
    "for i in range(0,12):\n",
    "    if not os.path.isfile(f'./CICIDS/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')\n",
    "    temp_df = pd.read_csv(f'./CICIDS/output{i}.csv')\n",
    "    df = pd.concat([df, temp_df], ignore_index=True)\n",
    "    write_log(f'------------ CSV File {i} added to DataFrame ------------')\n",
    "    if (i%3 == 0 and i!=9) or i==10:\n",
    "        df.to_csv(f'./CICIDS/CICIDS-EDA/output{k}.csv', index=False)\n",
    "        print(df['attack_cat'].value_counts())\n",
    "        df = pd.DataFrame()\n",
    "        k = k+1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5bae122-d771-4c32-b248-bc18db345c44",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Exporting Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4928aece-dfb9-48be-b3ba-a5cdaf9ddddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def hex_to_dec(hex_str):\n",
    "    try:\n",
    "        return [int(hex_str[i:i+2], 16) for i in range(0, len(hex_str), 2)]\n",
    "    except ValueError:\n",
    "        return [0,0]\n",
    "\n",
    "start_value = 16952481\n",
    "k = 5\n",
    "flow1 = pd.read_csv('./CICIDS/Export/CICIDS_Flow.csv')\n",
    "flow1 = flow1[['flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']]\n",
    "flow2 = flow1.copy()\n",
    "flow2.rename(columns={'source_ip': 'destination_ip', 'destination_ip': 'source_ip', 'source_port': 'destination_port', 'destination_port': 'source_port'}, inplace=True)\n",
    "flow = pd.concat([flow1, flow2])\n",
    "flow.drop_duplicates(subset=flow.columns.difference(['flow_id']), inplace=True)\n",
    "\n",
    "for i in range(3, 20):\n",
    "    \n",
    "    if not os.path.isfile(f'./CICIDS/output{i}.csv'):\n",
    "        continue\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\\n')\n",
    "    main_df = pd.read_csv(f'./CICIDS/output{i}.csv')\n",
    "    for j in range(2):\n",
    "        if (i==10 or i==6) and j==0:\n",
    "            df = main_df\n",
    "        elif (i==10 or i==6) and j==1:\n",
    "            continue\n",
    "        else:\n",
    "            rows = len(main_df)//2\n",
    "            if j==0:\n",
    "                df = main_df.head(rows)\n",
    "            elif j==1:\n",
    "                df = main_df.tail(len(main_df)-rows)\n",
    "        write_log(f'------------ BEGIN PACKET FLOW MERGE ------------')\n",
    "        df.rename(columns={'packet': 'packet_hex', 'payload': 'payload_hex', 'srcip': 'source_ip', 'dstip': 'destination_ip', 'sport': 'source_port', 'dsport': 'destination_port', 'protocol_m': 'protocol'}, inplace=True)\n",
    "        df['source_port'] = df['source_port'].astype(int)\n",
    "        df['destination_port'] = df['destination_port'].astype(int)\n",
    "        columns_to_match = ['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']\n",
    "        df = df.merge(flow, on=columns_to_match, how='left')\n",
    "        flow_id = df.pop('flow_id')\n",
    "        df.insert(0, 'flow_id', flow_id)\n",
    "        end_value = start_value + len(df)\n",
    "        df['packet_id'] = range(start_value, end_value)\n",
    "        df['payload_length'] = df.payload_hex.apply(lambda x: len(x)//2 if isinstance(x, str) else 0)\n",
    "        write_log(f'------------ END PACKET FLOW MERGE ------------')\n",
    "\n",
    "        write_log(f'------------ BEGIN PAYLOAD BYTES PROCESSING ------------')\n",
    "        df1 = df.dropna(subset='Raw load')\n",
    "        df1.reset_index(drop=True, inplace=True)\n",
    "        dec_data = df1['payload_hex'].apply(lambda x: hex_to_dec(str(x)[:3000]))\n",
    "        max_len = dec_data.apply(len).max()\n",
    "        df_final = pd.DataFrame(dec_data.tolist(), columns=[f'payload_byte_{i}' for i in range(1,max_len+1)])\n",
    "        df_final = pd.concat([df1[['packet_id', 'flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'payload_length']], df_final], axis=1)\n",
    "        df_final['attack_label'] = df1['attack_cat']\n",
    "        df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "        write_log(f'------------ END PAYLOAD BYTES PROCESSING ------------')\n",
    "\n",
    "        df_final.to_csv(f'./CICIDS/Export/Payload-Bytes/Payload_Bytes_File_{k}.csv', index=False)\n",
    "        write_log(f'------------ CSV File {i}:{j}:{k} Saved for Payload Bytes ------------')\n",
    "\n",
    "        del dec_data\n",
    "        del df_final\n",
    "        del df1\n",
    "        \n",
    "        write_log(f'------------ BEGIN PACKET BYTES PROCESSING ------------')\n",
    "        dec_data = df['packet_hex'].apply(lambda x: hex_to_dec(x[:3200]))\n",
    "        max_len = dec_data.apply(len).max()\n",
    "        df_final = pd.DataFrame(dec_data.tolist(), columns=[f'packet_byte_{i}' for i in range(1,max_len+1)])\n",
    "        df_final = pd.concat([df[['packet_id', 'flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'payload_length']], df_final], axis=1)\n",
    "        df_final['attack_label'] = df['attack_cat']\n",
    "        df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "        write_log(f'------------ END PACKET BYTES PROCESSING ------------')\n",
    "\n",
    "        df_final.to_csv(f'./CICIDS/Export/Packet-Bytes/Packet_Bytes_File_{k}.csv', index=False)\n",
    "        write_log(f'------------ CSV File {i}:{j}:{k} Saved for Packet Bytes ------------')\n",
    "        \n",
    "        del dec_data\n",
    "        del df_final\n",
    "\n",
    "        write_log(f'------------ BEGIN PACKET FIELDS PROCESSING ------------')\n",
    "        df.drop(['stime', 'sttl', 'total_len', 'first_layer', 't_delta', 'stime_flow', 'duration', 'offset', 'srcip_flow', 'sport_flow', 'dstip_flow', 'dsport_flow', 'label', 'payload_length'], axis=1, inplace=True)\n",
    "        attack_label = df.pop('attack_cat')\n",
    "        df.insert(len(df.columns), 'attack_label', attack_label)\n",
    "        packet_id = df.pop('packet_id')\n",
    "        df.insert(0, 'packet_id', packet_id)\n",
    "        df.drop_duplicates(subset=df.columns.difference(['packet_id', 'flow_id']), inplace=True)\n",
    "        start_value += len(df)\n",
    "        counts = '\\n'.join([f'{key}:{value}' for key, value in df.attack_label.value_counts().to_dict().items()])\n",
    "        f = open(\"CICIDS_INFO.txt\", \"a\")\n",
    "        f.write(f'TOTAL PACKETS IN CSV FILE {k}: {df.shape}\\n')\n",
    "        f.write(f'ATTACK LABELS IN CSV FILE {k}\\n' + counts + '\\n\\n')\n",
    "        f.close()\n",
    "        write_log(f'------------ END PACKET FIELDS PROCESSING ------------')\n",
    "\n",
    "        df.to_csv(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{k}.csv', index=False)\n",
    "        write_log(f'------------ CSV File {i}:{j}:{k} Saved for Packet Fields ------------\\n')\n",
    "        k+=1\n",
    "        \n",
    "        del df\n",
    "        \n",
    "    del main_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "708f05b4-38bd-42ab-a1ea-d0f46fc41183",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(1,19):\n",
    "    file_name_1 = f\"./CICIDS/Export/Payload-Bytes/Payload_Bytes_File_{i}.zip\"\n",
    "    !unzip {file_name_1}\n",
    "    \n",
    "    file_name_2 = f\"./CICIDS/Export/Packet-Bytes/Packet_Bytes_File_{i}.zip\"\n",
    "    !unzip {file_name_2}\n",
    "    \n",
    "    file_name_3 = f\"./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.zip\"\n",
    "    !unzip {file_name_3}\n",
    "    \n",
    "    write_log(f\"FILE {i} UNZIPPED SUCCESSFULLY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d54542d0-f1b9-48ce-b45d-0e34119c2c17",
   "metadata": {},
   "source": [
    "# Uploading Files to Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6155ffae-778a-49da-a784-24d813e8cc31",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login, HfApi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0bf5eb9d-3425-4fb3-89f8-a862da598945",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6d23532a1d8f4e49b700bf999761f741",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f918d9f5-2768-4444-be41-adcffa97f679",
   "metadata": {},
   "outputs": [],
   "source": [
    "api = HfApi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "02039c11-c5e0-41ee-8ca6-9df3756d7a95",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/rajkumardheivanayahi/.local/lib/python3.9/site-packages/huggingface_hub/utils/_experimental.py:57: UserWarning: 'plan_multi_commits' is experimental and might be subject to breaking changes in the future. You can disable this warning by setting `HF_HUB_DISABLE_EXPERIMENTAL_WARNING=1` as environment variable.\n",
      "  warnings.warn(\n",
      "/data/rajkumardheivanayahi/.local/lib/python3.9/site-packages/huggingface_hub/utils/_experimental.py:57: UserWarning: 'HfApi.create_commits_on_pr' is experimental and might be subject to breaking changes in the future. You can disable this warning by setting `HF_HUB_DISABLE_EXPERIMENTAL_WARNING=1` as environment variable.\n",
      "  warnings.warn(\n",
      "Will create 0 deletion commit(s) and 18 addition commit(s), totalling 18 atomic operations.\n",
      "Multi-commits strategy with ID 7c904ad71385d209b0053f17d71f906afb351392bb35333dd1883c882405794c.\n",
      "PR already exists: https://huggingface.co/datasets/rdpahalavan/CIC-IDS2017/discussions/6. Will resume process where it stopped.\n",
      "Found 9 existing commits on the PR.\n",
      "9 commits remaining (0 deletion commits and 9 addition commits)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a7208d50abd6411ca6125521bd030992",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_2.parquet:   0%|          | 0.00/14.5G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 918501eb5bdfdfcdd0ad2c6ee9b89eee6bb4a1b11e137f3a0fce0eabb47d979e completed (still 8 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4598884cbe344638aba41cfd0ca95075",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_3.parquet:   0%|          | 0.00/13.6G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 147fac4088ff613c70a50a1f31669161860e9281c10ae7c7f03d62cb47dd32da completed (still 7 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dcb0efacaec74c4fa8f121deddf45165",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_4.parquet:   0%|          | 0.00/19.8G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 1421260259428b0f5666c7998eeea30cb55bf32fef97844161525f4b3c895ab5 completed (still 6 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2442a2a80f1a4a5bb14b177c5badeb61",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_5.parquet:   0%|          | 0.00/21.0G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 887c5610b18ebb2690de9aec09ae96ec3a862a34d5ad10cbfbcb1ad7a3807b2f completed (still 5 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f0f10f0cbaa643e3867a1b1a1c91d7d7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_6.parquet:   0%|          | 0.00/9.84G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step ea50560679d3386b703a9c1ee7602c8135c97c0e17081430e0440fa5fc236519 completed (still 4 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7fe344580acf49b48675f81dc6466029",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_7.parquet:   0%|          | 0.00/12.1G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step 408de3027229d759088eb36f8b544fe74a78e1475cffc5e6dc03d2236eee0fa1 completed (still 3 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "197988988b3045b29c1a421820a7eadf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_8.parquet:   0%|          | 0.00/19.9G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step e607dee63049e91b0f73016edff621128dfb0be4a9cce802c305b5f32292cb2d completed (still 2 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5ee27e42c506464fb449f7b2adb1e7c9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_9.parquet:   0%|          | 0.00/19.9G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step b3b2d9e68d720c614ad5f0e8ca7a720ee6ec146e007f3335f48dc20e874b9b8b completed (still 1 to go).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f54b802aad5a4b93851dcecb014f9a37",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Packet_Fields_File_18.parquet:   0%|          | 0.00/1.52G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  step d0e05e89fcb38a137c5e27506454ba93471d7ac662932ea8de41da5612bffb61 completed (still 0 to go).\n",
      "All commits have been pushed.\n",
      "PR is now open for reviews.\n",
      "PR has been automatically merged (`merge_pr=True` was passed).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/datasets/rdpahalavan/CIC-IDS2017/tree/main/Packet-Fields/'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "api.upload_folder(\n",
    "    folder_path=\"./CICIDS/Export/Packet-Fields\",\n",
    "    repo_id=\"rdpahalavan/CIC-IDS2017\",\n",
    "    path_in_repo=\"Packet-Fields/\",\n",
    "    repo_type=\"dataset\",\n",
    "    allow_patterns=\"*.parquet\",\n",
    "    delete_patterns=\"*.parquet\",\n",
    "    multi_commits=True,\n",
    "    multi_commits_verbose=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a06f747-e91b-4ef2-9dcd-ddc660e8299a",
   "metadata": {},
   "source": [
    "# CSV to Parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbf3cbb8-b98a-45c2-a57a-7186ccd671af",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(1,19):\n",
    "    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\\n')\n",
    "    df = pd.read_csv(f'./CICIDS/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv', low_memory=False)\n",
    "    df = df.convert_dtypes()\n",
    "    df.to_csv(f'./CICIDS/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv', index=False)\n",
    "    df.to_parquet(f'./CICIDS/Export/Payload-Bytes/Payload_Bytes_File_{i}.parquet', index=False)\n",
    "    del df\n",
    "    write_log(f'------------ Payload Bytes Exported ------------')\n",
    "    df = pd.read_csv(f'./CICIDS/Export/Packet-Bytes/Packet_Bytes_File_{i}.csv', low_memory=False)\n",
    "    df = df.convert_dtypes()\n",
    "    df.to_csv(f'./CICIDS/Export/Packet-Bytes/Packet_Bytes_File_{i}.csv', index=False)\n",
    "    df.to_parquet(f'./CICIDS/Export/Packet-Bytes/Packet_Bytes_File_{i}.parquet', index=False)\n",
    "    del df\n",
    "    write_log(f'------------ Packet Bytes Exported ------------')\n",
    "    df = pd.read_csv(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.csv', low_memory=False)\n",
    "    df = df.convert_dtypes()\n",
    "    df.to_csv(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.csv', index=False)\n",
    "    df.to_parquet(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.parquet', index=False)\n",
    "    del df\n",
    "    write_log(f'------------ Packet Fields Exported ------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d9899ab0-fbd6-49c3-8567-a4711d569632",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(f'./CICIDS/Export/Network-Flows/CICIDS_Flow.csv', low_memory=False)\n",
    "df = df.convert_dtypes()\n",
    "df.to_csv(f'./CICIDS/Export/Network-Flows/CICIDS_Flow.csv', index=False)\n",
    "df.to_parquet(f'./CICIDS/Export/Network-Flows/CICIDS_Flow.parquet', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a2fb78f2-5394-486b-bd16-2c17b84e2dd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "all_columns = set()\n",
    "\n",
    "for i in range(1,19):\n",
    "    df = pd.read_csv(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.csv', nrows=0)\n",
    "    all_columns = all_columns.union(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3c3b9f37-f763-458c-97f2-11d49bd8fbba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "261"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(all_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "66be7040-327a-486b-b6b8-db7b4f8b3d17",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 1\n",
      "Completed: 2\n",
      "Completed: 3\n",
      "Completed: 4\n",
      "Completed: 5\n",
      "Completed: 6\n",
      "Completed: 7\n",
      "Completed: 8\n",
      "Completed: 9\n",
      "Completed: 10\n",
      "Completed: 11\n",
      "Completed: 12\n",
      "Completed: 13\n",
      "Completed: 14\n",
      "Completed: 15\n",
      "Completed: 16\n",
      "Completed: 17\n",
      "Completed: 18\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,19):\n",
    "    df = pd.read_parquet(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.parquet')\n",
    "    missing_cols = all_columns - set(df.columns)\n",
    "    df_missing_cols = pd.DataFrame({col: np.nan for col in missing_cols}, index=df.index)\n",
    "    df_missing_cols = df_missing_cols.convert_dtypes()\n",
    "    df = pd.concat([df, df_missing_cols], axis=1)\n",
    "    df.to_parquet(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.parquet', index=False)\n",
    "    del df_missing_cols\n",
    "    del df\n",
    "    print(f\"Completed: {i}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3f4151a6-8d53-4ecd-9072-301c25f6e990",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: Packet_Fields_File_1.parquet\n",
      "Completed: Packet_Fields_File_2.parquet\n",
      "Completed: Packet_Fields_File_3.parquet\n",
      "Completed: Packet_Fields_File_4.parquet\n",
      "Completed: Packet_Fields_File_5.parquet\n",
      "Completed: Packet_Fields_File_6.parquet\n",
      "Completed: Packet_Fields_File_7.parquet\n",
      "Completed: Packet_Fields_File_8.parquet\n",
      "Completed: Packet_Fields_File_9.parquet\n",
      "Completed: Packet_Fields_File_10.parquet\n",
      "Completed: Packet_Fields_File_11.parquet\n",
      "Completed: Packet_Fields_File_12.parquet\n",
      "Completed: Packet_Fields_File_13.parquet\n",
      "Completed: Packet_Fields_File_14.parquet\n",
      "Completed: Packet_Fields_File_15.parquet\n",
      "Completed: Packet_Fields_File_16.parquet\n",
      "Completed: Packet_Fields_File_17.parquet\n",
      "Completed: Packet_Fields_File_18.parquet\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import pyarrow.parquet as pq\n",
    "from collections import defaultdict\n",
    "\n",
    "files = [f\"Packet_Fields_File_{i}.parquet\" for i in range(1,19)]\n",
    "\n",
    "column_types = defaultdict(list)\n",
    "\n",
    "for file in files:\n",
    "    df = pd.read_parquet(\"./CICIDS/Export/Packet-Fields/\"+file)\n",
    "    for column, dtype in df.dtypes.items():\n",
    "        column_types[column].append(dtype)\n",
    "    print(f\"Completed: {file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2092f60f-8594-4ea7-a44a-9c10dcae9279",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "majority_column_types = {}\n",
    "\n",
    "for column, types in column_types.items():\n",
    "    counter = Counter(types)\n",
    "    majority_type = counter.most_common(1)[0][0]\n",
    "    majority_column_types[column] = majority_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a3177082-102a-4a00-a46e-a5633f575fcb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Completed: 1\n",
      "Completed: 2\n",
      "Completed: 3\n",
      "Completed: 4\n",
      "Completed: 5\n",
      "Completed: 6\n",
      "Completed: 7\n",
      "Completed: 8\n",
      "Completed: 9\n",
      "Completed: 10\n",
      "Completed: 11\n",
      "Completed: 12\n",
      "Completed: 13\n",
      "Completed: 14\n",
      "Completed: 15\n",
      "Completed: 16\n",
      "Completed: 17\n",
      "Completed: 18\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,19):\n",
    "    df = pd.read_parquet(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.parquet')\n",
    "    for column, dtype in df.dtypes.items():\n",
    "        majority_type = majority_column_types[column]\n",
    "        if dtype != majority_type:\n",
    "            df[column] = df[column].astype(majority_type)\n",
    "    df.to_parquet(f'./CICIDS/Export/Packet-Fields/Packet_Fields_File_{i}.parquet', index=False)\n",
    "    del df\n",
    "    print(f\"Completed: {i}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
